diff options
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/charset.cc | 27 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 1 |
2 files changed, 28 insertions, 0 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 3c47d4f..d7f323b 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
   return true;
 }
 
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6-byte sequences as per
+         RFC 2279, but this has been superseded by RFC 3629, which
+         restricts UTF-8 to 1-byte through 4-byte sequences, and
+         states "the octet values C0, C1, F5 to FF never appear".
+
+         Reject such lead bytes (F4 is legal), then cap at U+10FFFF.  */
+      if (*iter == 0xc0 || *iter == 0xc1 || *iter >= 0xf5)
+        return false;
+
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err || cp > 0x10ffff)
+        return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct.  N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein.  WIDE
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 8df071e..a6f0abd 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1600,5 +1600,6 @@ int cpp_wcwidth (cppchar_t c);
 
 bool cpp_input_conversion_is_trivial (const char *input_charset);
 int cpp_check_utf8_bom (const char *data, size_t data_length);
+bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
 
 #endif /* ! LIBCPP_CPPLIB_H */