aboutsummaryrefslogtreecommitdiff
path: root/libcpp
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp')
-rw-r--r-- libcpp/charset.cc 27
-rw-r--r-- libcpp/include/cpplib.h 1
2 files changed, 28 insertions, 0 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 3c47d4f..d7f323b 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
return true;
}
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6-byte sequences as per
+         RFC 2279, but this has been superseded by RFC 3629, which
+         restricts UTF-8 to 1-byte through 4-byte sequences, and
+         states "the octet values C0, C1, F5 to FF never appear".
+         0xF4 is still a legal lead byte (U+100000 through U+10FFFF),
+         so only lead bytes strictly above it are rejected here.  */
+      if (*iter > 0xf4)
+        return false;
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      /* CP is only read on success; RFC 3629 stops at U+10FFFF.  */
+      if (err || cp > 0x10ffff)
+        return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 8df071e..a6f0abd 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1600,5 +1600,6 @@ int cpp_wcwidth (cppchar_t c);
bool cpp_input_conversion_is_trivial (const char *input_charset);
int cpp_check_utf8_bom (const char *data, size_t data_length);
+bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
#endif /* ! LIBCPP_CPPLIB_H */