about summary refs log tree commit diff
path: root/libcpp
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp')
-rw-r--r-- libcpp/charset.cc 194
1 files changed, 177 insertions, 17 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index d5a0275..9a944d9 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
return 0;
}
+
+/* Special conversion routine which only counts the characters in the
+   input.  Each successful call consumes one character from *INBUFP and
+   stores exactly one uchar into *OUTBUFP; the value stored is
+   irrelevant, only the number of uchars written matters.
+
+   Returns 0 on success, E2BIG if there is no room left in the output,
+   and otherwise the error reported for the input: whatever
+   one_utf8_to_cppchar returns on ASCII hosts, or EINVAL (truncated
+   input) / EILSEQ (malformed sequence) on other hosts.  */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+                 uchar **outbufp, size_t *outbytesleftp)
+{
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 1)
+    return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  /* The decoded value itself is never used; it is declared here so
+     that it is not an unused variable on non-ASCII hosts.  */
+  cppchar_t s = 0;
+  int rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+#else
+  if (*inbytesleftp < 1)
+    return EINVAL;
+  /* Per-byte classification.  As used below: 9 marks a byte that is
+     only acceptable as a trailing byte, 0 and 1 both denote a
+     single-byte character, and 2..7 give the total length in bytes of
+     the sequence introduced by that lead byte.  */
+  static const uchar utf_ebcdic_map[256] = {
+    /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+    1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+    1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+  };
+  /* Kept as size_t so the comparison against *INBYTESLEFTP below does
+     not mix signed and unsigned operands.  */
+  size_t seqlen = utf_ebcdic_map[**inbufp];
+  if (seqlen == 9)
+    /* A trailing byte cannot start a character.  */
+    return EILSEQ;
+  if (seqlen == 0)
+    seqlen = 1;
+  if (seqlen >= 2)
+    {
+      if (*inbytesleftp < seqlen)
+        return EINVAL;
+      /* Every byte after the lead byte must be a trailing byte.  */
+      for (size_t i = 1; i < seqlen; ++i)
+        if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+          return EILSEQ;
+    }
+  *inbytesleftp -= seqlen;
+  *inbufp += seqlen;
+#endif
+
+  /* Emit one placeholder uchar for the character just consumed.  */
+  **outbufp = ' ';
+
+  *outbufp += 1;
+  *outbytesleftp -= 1;
+  return 0;
+}
+
+
/* Helper routine for the next few functions. The 'const' on
one_conversion means that we promise not to modify what function is
pointed to, which lets the inliner see through it. */
@@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
+/* Pseudo-conversion used purely for counting: run conversion_loop over
+   the FLEN bytes at FROM with one_count_chars as the per-character
+   step.  Afterwards only TO->len is significant.  Returns whatever
+   conversion_loop returns.  */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+                     size_t flen, struct _cpp_strbuf *to)
+{
+  const bool converted
+    = conversion_loop (one_count_chars, cd, from, flen, to);
+  return converted;
+}
+
/* Identity conversion, used when we have no alternative. */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2574,21 +2650,49 @@ cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
}
+/* Return the number of source characters in STR, a literal of kind
+   TYPE.  The count is obtained by running cpp_interpret_string with
+   the narrow character set converter temporarily replaced by
+   convert_count_chars and the diagnostic callback temporarily replaced
+   by noop_diagnostic_cb; both are restored before returning.  The
+   value returned is the length of the interpreted result, or 0 if
+   cpp_interpret_string fails.  */
+static unsigned
+count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
+{
+  /* Remember the hooks that are about to be overridden.  */
+  bool (*prev_diagnostic) (cpp_reader *, enum cpp_diagnostic_level,
+                           enum cpp_warning_reason, rich_location *,
+                           const char *, va_list *)
+    ATTRIBUTE_FPTR_PRINTF(5,0);
+  prev_diagnostic = pfile->cb.diagnostic;
+  convert_f prev_func = pfile->narrow_cset_desc.func;
+
+  /* Install the counting converter and the replacement diagnostic
+     callback for the duration of the interpretation.  */
+  pfile->cb.diagnostic = noop_diagnostic_cb;
+  pfile->narrow_cset_desc.func = convert_count_chars;
+
+  cpp_string counted = { 0, 0 };
+  const bool ok = cpp_interpret_string (pfile, &str, 1, &counted, type);
+
+  /* Put the original hooks back, in reverse order of installation.  */
+  pfile->narrow_cset_desc.func = prev_func;
+  pfile->cb.diagnostic = prev_diagnostic;
+
+  if (!ok)
+    return 0;
+
+  /* Only the length is of interest; release the buffer if one was
+     handed back.  */
+  if (counted.text != str.text)
+    free ((void *)counted.text);
+  return counted.len;
+}
+
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for narrow strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
- cpp_interpret_charconst. TYPE is the token type. */
+ cpp_interpret_charconst. TOKEN is the token. */
static cppchar_t
narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
unsigned int *pchars_seen, int *unsignedp,
- enum cpp_ttype type)
+ const cpp_token *token)
{
+ enum cpp_ttype type = token->type;
size_t width = CPP_OPTION (pfile, char_precision);
size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
size_t mask = width_to_mask (width);
size_t i;
cppchar_t result, c;
bool unsigned_p;
+ bool diagnosed = false;
/* The value of a multi-character character constant, or a
single-character character constant whose representation in the
@@ -2612,11 +2716,55 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
if (type == CPP_UTF8CHAR)
max_chars = 1;
- if (i > max_chars)
+ else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
{
+ /* P1854R4 - Making non-encodable string literals ill-formed,
+ applied to C++ as a DR, makes a multi-character narrow character
+ literal ill-formed if any of the characters in the literal isn't
+ encodable in char/unsigned char. To detect that, we need to
+ count the number of c-chars and compare that to str.len. */
+ unsigned src_chars = count_source_chars (pfile, token->val.str, type);
+
+ if (src_chars)
+ {
+ if (str.len > src_chars)
+ {
+ if (src_chars <= 2)
+ diagnosed
+ = cpp_error (pfile, CPP_DL_PEDWARN,
+ "character not encodable in a single execution "
+ "character code unit");
+ else
+ diagnosed
+ = cpp_error (pfile, CPP_DL_PEDWARN,
+ "at least one character in a multi-character "
+ "literal not encodable in a single execution "
+ "character code unit");
+ if (diagnosed && i > max_chars)
+ i = max_chars;
+ }
+ }
+ }
+ if (diagnosed)
+ /* Already diagnosed above. */;
+ else if (i > max_chars)
+ {
+ unsigned src_chars
+ = count_source_chars (pfile, token->val.str,
+ type == CPP_UTF8CHAR ? CPP_CHAR : type);
+
+ if (type != CPP_UTF8CHAR)
+ cpp_error (pfile, CPP_DL_WARNING,
+ "multi-character literal with %ld characters exceeds "
+ "'int' size of %ld bytes", (long) i, (long) max_chars);
+ else if (src_chars > 2)
+ cpp_error (pfile, CPP_DL_ERROR,
+ "multi-character literal cannot have an encoding prefix");
+ else
+ cpp_error (pfile, CPP_DL_ERROR,
+ "character not encodable in a single code unit");
i = max_chars;
- cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
- "character constant too long for its type");
}
else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
@@ -2651,12 +2799,13 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for wide strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
- cpp_interpret_charconst. TYPE is the token type. */
+ cpp_interpret_charconst. TOKEN is the token. */
static cppchar_t
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
unsigned int *pchars_seen, int *unsignedp,
- enum cpp_ttype type)
+ const cpp_token *token)
{
+ enum cpp_ttype type = token->type;
bool bigend = CPP_OPTION (pfile, bytes_big_endian);
size_t width = converter_for_type (pfile, type).width;
size_t cwidth = CPP_OPTION (pfile, char_precision);
@@ -2692,14 +2841,25 @@ wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
character exactly fills a wchar_t, so a multi-character wide
character constant is guaranteed to overflow. */
if (str.len > nbwc * 2)
- cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
- && (type == CPP_CHAR16
- || type == CPP_CHAR32
- /* In C++23 this is error even for L'ab'. */
- || (type == CPP_WCHAR
- && CPP_OPTION (pfile, size_t_literals))))
- ? CPP_DL_ERROR : CPP_DL_WARNING,
- "character constant too long for its type");
+ {
+ cpp_diagnostic_level level = CPP_DL_WARNING;
+ unsigned src_chars
+ = count_source_chars (pfile, token->val.str, CPP_CHAR);
+
+ if (CPP_OPTION (pfile, cplusplus)
+ && (type == CPP_CHAR16
+ || type == CPP_CHAR32
+ /* In C++23 this is error even for L'ab'. */
+ || (type == CPP_WCHAR
+ && CPP_OPTION (pfile, size_t_literals))))
+ level = CPP_DL_ERROR;
+ if (src_chars > 2)
+ cpp_error (pfile, level,
+ "multi-character literal cannot have an encoding prefix");
+ else
+ cpp_error (pfile, level,
+ "character not encodable in a single code unit");
+ }
/* Truncate the constant to its natural width, and simultaneously
sign- or zero-extend to the full width of cppchar_t. */
@@ -2754,10 +2914,10 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
if (wide)
result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
- token->type);
+ token);
else
result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
- token->type);
+ token);
if (str.text != token->val.str.text)
free ((void *)str.text);