aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.cc
diff options
context:
space:
mode:
authorThomas Schwinge <thomas@codesourcery.com>2023-04-10 10:59:48 +0200
committerThomas Schwinge <thomas@codesourcery.com>2023-04-10 10:59:48 +0200
commit3757e8d71794cece4a5c9d08245b7ad111044853 (patch)
tree778c6eb8324fa714713ce30a0897e44cf276508b /libcpp/charset.cc
parente44f127cdb12a28536fe21983dfad20570bceda0 (diff)
parent6baa95c9c5b3fea96fd22d03d961db4e4cf48d88 (diff)
downloadgcc-3757e8d71794cece4a5c9d08245b7ad111044853.zip
gcc-3757e8d71794cece4a5c9d08245b7ad111044853.tar.gz
gcc-3757e8d71794cece4a5c9d08245b7ad111044853.tar.bz2
Merge commit '6baa95c9c5b3fea96fd22d03d961db4e4cf48d88' into HEAD [#2112]
Diffstat (limited to 'libcpp/charset.cc')
-rw-r--r--libcpp/charset.cc27
1 files changed, 27 insertions, 0 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 3c47d4f..d7f323b 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
return true;
}
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8
+   in the sense of RFC 3629: only 1-byte through 4-byte sequences, and
+   no code point above U+10FFFF.  An empty buffer (NUM_BYTES == 0) is
+   considered valid; BUFFER is not dereferenced in that case.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6-byte sequences as per
+	 RFC 2279, but this has been superseded by RFC 3629, which
+	 restricts UTF-8 to 1-byte through 4-byte sequences, and
+	 states "the octet values C0, C1, F5 to FF never appear".
+
+	 Reject lead bytes F5 to FF up front.  F4 itself is a legitimate
+	 lead byte (it starts the encodings of U+100000 through U+10FFFF),
+	 so it must not be rejected here.  */
+      if (*iter > 0xf4)
+	return false;
+
+      /* The loop relies on one_utf8_to_cppchar advancing ITER and
+	 decrementing BYTESLEFT as it consumes a sequence.  */
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err)
+	return false;
+
+      /* A lead byte of F4 followed by 90..BF would decode to a value
+	 beyond the Unicode range; RFC 3629 forbids that, so check the
+	 decoded code point explicitly rather than depending on the
+	 decoder to do so.  */
+      if (cp > 0x10FFFF)
+	return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE