diff options
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/charset.cc | 194 |
1 files changed, 177 insertions, 17 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc index d5a0275..9a944d9 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, return 0; } + +/* Special routine which just counts number of characters in the + string, what exactly is stored into the output doesn't matter + as long as it is one uchar per character. */ + +static inline int +one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp, + uchar **outbufp, size_t *outbytesleftp) +{ + cppchar_t s = 0; + int rval; + + /* Check for space first, since we know exactly how much we need. */ + if (*outbytesleftp < 1) + return E2BIG; + +#if HOST_CHARSET == HOST_CHARSET_ASCII + rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s); + if (rval) + return rval; +#else + if (*inbytesleftp < 1) + return EINVAL; + static const uchar utf_ebcdic_map[256] = { + /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, + 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, + 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, + 9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, + 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0 + }; + rval = utf_ebcdic_map[**inbufp]; + if (rval == 9) + return EILSEQ; + if (rval == 0) + rval = 1; + if (rval >= 2) + { + if (*inbytesleftp < rval) + return EINVAL; + for (int i = 1; i < rval; ++i) + if (utf_ebcdic_map[(*inbufp)[i]] != 9) + return EILSEQ; + } + *inbytesleftp -= rval; + *inbufp += rval; +#endif + + **outbufp = ' '; + + *outbufp += 1; + *outbytesleftp -= 1; + return 0; +} + + /* Helper routine for the next few functions. The 'const' on one_conversion means that we promise not to modify what function is pointed to, which lets the inliner see through it. */ @@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen, return conversion_loop (one_utf32_to_utf8, cd, from, flen, to); } +/* Magic conversion which just counts characters from input, so + only to->len is significant. */ +static bool +convert_count_chars (iconv_t cd, const uchar *from, + size_t flen, struct _cpp_strbuf *to) +{ + return conversion_loop (one_count_chars, cd, from, flen, to); +} + /* Identity conversion, used when we have no alternative. */ static bool convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED, @@ -2574,21 +2650,49 @@ cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from, } +/* Return number of source characters in STR. */ +static unsigned +count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type) +{ + cpp_string str2 = { 0, 0 }; + bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level, + enum cpp_warning_reason, rich_location *, + const char *, va_list *) + ATTRIBUTE_FPTR_PRINTF(5,0); + saved_diagnostic_handler = pfile->cb.diagnostic; + pfile->cb.diagnostic = noop_diagnostic_cb; + convert_f save_func = pfile->narrow_cset_desc.func; + pfile->narrow_cset_desc.func = convert_count_chars; + bool ret = cpp_interpret_string (pfile, &str, 1, &str2, type); + pfile->narrow_cset_desc.func = save_func; + pfile->cb.diagnostic = saved_diagnostic_handler; + if (ret) + { + if (str2.text != str.text) + free ((void *)str2.text); + return str2.len; + } + else + return 0; +} + /* Subroutine of cpp_interpret_charconst which performs the conversion to a number, for narrow strings. STR is the string structure returned by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for - cpp_interpret_charconst. TYPE is the token type. */ + cpp_interpret_charconst. TOKEN is the token. */ static cppchar_t narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, unsigned int *pchars_seen, int *unsignedp, - enum cpp_ttype type) + const cpp_token *token) { + enum cpp_ttype type = token->type; size_t width = CPP_OPTION (pfile, char_precision); size_t max_chars = CPP_OPTION (pfile, int_precision) / width; size_t mask = width_to_mask (width); size_t i; cppchar_t result, c; bool unsigned_p; + bool diagnosed = false; /* The value of a multi-character character constant, or a single-character character constant whose representation in the @@ -2612,11 +2716,55 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, if (type == CPP_UTF8CHAR) max_chars = 1; - if (i > max_chars) + else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile)) { + /* C++ as a DR since + P1854R4 - Making non-encodable string literals ill-formed + makes multi-character narrow character literals if any of the + characters in the literal isn't encodable in char/unsigned char + ill-formed. We need to count the number of c-chars and compare + that to str.len. */ + unsigned src_chars = count_source_chars (pfile, token->val.str, type); + + if (src_chars) + { + if (str.len > src_chars) + { + if (src_chars <= 2) + diagnosed + = cpp_error (pfile, CPP_DL_PEDWARN, + "character not encodable in a single execution " + "character code unit"); + else + diagnosed + = cpp_error (pfile, CPP_DL_PEDWARN, + "at least one character in a multi-character " + "literal not encodable in a single execution " + "character code unit"); + if (diagnosed && i > max_chars) + i = max_chars; + } + } + } + if (diagnosed) + /* Already diagnosed above. */; + else if (i > max_chars) + { + unsigned src_chars + = count_source_chars (pfile, token->val.str, + type == CPP_UTF8CHAR ? CPP_CHAR : type); + + if (type != CPP_UTF8CHAR) + cpp_error (pfile, CPP_DL_WARNING, + "multi-character literal with %ld characters exceeds " + "'int' size of %ld bytes", (long) i, (long) max_chars); + else if (src_chars > 2) + cpp_error (pfile, CPP_DL_ERROR, + "multi-character literal cannot have an encoding prefix"); + else + cpp_error (pfile, CPP_DL_ERROR, + "character not encodable in a single code unit"); i = max_chars; - cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING, - "character constant too long for its type"); } else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant"); @@ -2651,12 +2799,13 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, /* Subroutine of cpp_interpret_charconst which performs the conversion to a number, for wide strings. STR is the string structure returned by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for - cpp_interpret_charconst. TYPE is the token type. */ + cpp_interpret_charconst. TOKEN is the token. */ static cppchar_t wide_str_to_charconst (cpp_reader *pfile, cpp_string str, unsigned int *pchars_seen, int *unsignedp, - enum cpp_ttype type) + const cpp_token *token) { + enum cpp_ttype type = token->type; bool bigend = CPP_OPTION (pfile, bytes_big_endian); size_t width = converter_for_type (pfile, type).width; size_t cwidth = CPP_OPTION (pfile, char_precision); @@ -2692,14 +2841,25 @@ wide_str_to_charconst (cpp_reader *pfile, cpp_string str, character exactly fills a wchar_t, so a multi-character wide character constant is guaranteed to overflow. */ if (str.len > nbwc * 2) - cpp_error (pfile, (CPP_OPTION (pfile, cplusplus) - && (type == CPP_CHAR16 - || type == CPP_CHAR32 - /* In C++23 this is error even for L'ab'. */ - || (type == CPP_WCHAR - && CPP_OPTION (pfile, size_t_literals)))) - ? CPP_DL_ERROR : CPP_DL_WARNING, - "character constant too long for its type"); + { + cpp_diagnostic_level level = CPP_DL_WARNING; + unsigned src_chars + = count_source_chars (pfile, token->val.str, CPP_CHAR); + + if (CPP_OPTION (pfile, cplusplus) + && (type == CPP_CHAR16 + || type == CPP_CHAR32 + /* In C++23 this is error even for L'ab'. */ + || (type == CPP_WCHAR + && CPP_OPTION (pfile, size_t_literals)))) + level = CPP_DL_ERROR; + if (src_chars > 2) + cpp_error (pfile, level, + "multi-character literal cannot have an encoding prefix"); + else + cpp_error (pfile, level, + "character not encodable in a single code unit"); + } /* Truncate the constant to its natural width, and simultaneously sign- or zero-extend to the full width of cppchar_t. */ @@ -2754,10 +2914,10 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, if (wide) result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, - token->type); + token); else result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp, - token->type); + token); if (str.text != token->val.str.text) free ((void *)str.text); |