aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.cc
diff options
context:
space:
mode:
author Thomas Schwinge <thomas@codesourcery.com> 2023-04-07 22:04:54 +0200
committer Thomas Schwinge <thomas_schwinge@mentor.com> 2023-04-07 22:04:54 +0200
commit b1d131f8941cd30ee7904698e7d3bcc20686b61c (patch)
tree 3dcb318ea104daa2fcc771c5e3de080414c45730 /libcpp/charset.cc
parent 353f5e8f00a2f897d2974f07d27dd10f79666889 (diff)
parent 939fb3fffebf7be2638ddef25dbe1c15f2aaa9d6 (diff)
download gcc-b1d131f8941cd30ee7904698e7d3bcc20686b61c.zip
gcc-b1d131f8941cd30ee7904698e7d3bcc20686b61c.tar.gz
gcc-b1d131f8941cd30ee7904698e7d3bcc20686b61c.tar.bz2
Merge commit '939fb3fffebf7be2638ddef25dbe1c15f2aaa9d6' into HEAD
Diffstat (limited to 'libcpp/charset.cc')
-rw-r--r-- libcpp/charset.cc 27
1 files changed, 27 insertions, 0 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 3c47d4f..d7f323b 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
return true;
}
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8,
+   in the sense of RFC 3629.
+
+   The scan is driven by NUM_BYTES alone, so an empty buffer
+   (NUM_BYTES == 0) is reported as valid.  Decoding of each sequence is
+   delegated to one_utf8_to_cppchar; any nonzero result from it is
+   treated as "invalid".  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *) buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6-byte sequences as per
+         RFC 2279, but this has been superseded by RFC 3629, which
+         restricts UTF-8 to 1-byte through 4-byte sequences, and
+         states "the octet values C0, C1, F5 to FF never appear".
+
+         Reject F5..FF up front.  0xF4 must NOT be rejected here: it is
+         the lead byte of U+100000..U+10FFFF, which are valid.
+         NOTE(review): C0 and C1 are not tested here; presumably
+         one_utf8_to_cppchar rejects them as overlong encodings --
+         confirm against its implementation.  */
+      if (*iter >= 0xf5)
+        return false;
+
+      /* The call is passed the addresses of ITER and BYTESLEFT and is
+         relied on to advance past the sequence it consumed.  */
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err)
+        return false;
+
+      /* A 4-byte sequence led by F4 can encode values up to 0x13FFFF;
+         RFC 3629 caps code points at U+10FFFF.  Check explicitly so
+         this does not depend on the decoder enforcing the limit.  */
+      if (cp > 0x10FFFF)
+        return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE