1 files changed, 177 insertions, 17 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index d5a0275..9a944d9 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
   return 0;
 }
 
+
+/* Special routine which just counts number of characters in the
+   string, what exactly is stored into the output doesn't matter
+   as long as it is one uchar per character.  */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+		 uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s = 0;
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 1)
+    return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+#else
+  if (*inbytesleftp < 1)
+    return EINVAL;
+  static const uchar utf_ebcdic_map[256] = {
+    /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html  */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+    1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+    1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+  };
+  rval = utf_ebcdic_map[**inbufp];
+  if (rval == 9)
+    return EILSEQ;
+  if (rval == 0)
+    rval = 1;
+  if (rval >= 2)
+    {
+      if (*inbytesleftp < rval)
+	return EINVAL;
+      for (int i = 1; i < rval; ++i)
+	if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+	  return EILSEQ;
+    }
+  *inbytesleftp -= rval;
+  *inbufp += rval;
+#endif
+
+  **outbufp = ' ';
+
+  *outbufp += 1;
+  *outbytesleftp -= 1;
+  return 0;
+}
+
+
 /* Helper routine for the next few functions.  The 'const' on
    one_conversion means that we promise not to modify what function is
    pointed to, which lets the inliner see through it.  */
@@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
 }
 
+/* Magic conversion which just counts characters from input, so
+   only to->len is significant.  */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+		     size_t flen, struct _cpp_strbuf *to)
+{
+  return conversion_loop (one_count_chars, cd, from, flen, to);
+}
+
 /* Identity conversion, used when we have no alternative.  */
 static bool
 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2574,21 +2650,49 @@ cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
 }
 
 
+/* Return number of source characters in STR.  */
+static unsigned
+count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
+{
+  cpp_string str2 = { 0, 0 };
+  bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
+				    enum cpp_warning_reason, rich_location *,
+				    const char *, va_list *)
+    ATTRIBUTE_FPTR_PRINTF(5,0);
+  saved_diagnostic_handler = pfile->cb.diagnostic;
+  pfile->cb.diagnostic = noop_diagnostic_cb;
+  convert_f save_func = pfile->narrow_cset_desc.func;
+  pfile->narrow_cset_desc.func = convert_count_chars;
+  bool ret = cpp_interpret_string (pfile, &str, 1, &str2, type);
+  pfile->narrow_cset_desc.func = save_func;
+  pfile->cb.diagnostic = saved_diagnostic_handler;
+  if (ret)
+    {
+      if (str2.text != str.text)
+	free ((void *)str2.text);
+      return str2.len;
+    }
+  else
+    return 0;
+}
+
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 			 unsigned int *pchars_seen, int *unsignedp,
-			 enum cpp_ttype type)
+			 const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   size_t width = CPP_OPTION (pfile, char_precision);
   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
   size_t mask = width_to_mask (width);
   size_t i;
   cppchar_t result, c;
   bool unsigned_p;
+  bool diagnosed = false;
 
   /* The value of a multi-character character constant, or a
      single-character character constant whose representation in the
@@ -2612,11 +2716,55 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 
   if (type == CPP_UTF8CHAR)
     max_chars = 1;
-  if (i > max_chars)
+  else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
     {
+      /* C++ as a DR since
+	 P1854R4 - Making non-encodable string literals ill-formed
+	 makes multi-character narrow character literals if any of the
+	 characters in the literal isn't encodable in char/unsigned char
+	 ill-formed.  We need to count the number of c-chars and compare
+	 that to str.len.  */
+      unsigned src_chars = count_source_chars (pfile, token->val.str, type);
+
+      if (src_chars)
+	{
+	  if (str.len > src_chars)
+	    {
+	      if (src_chars <= 2)
+		diagnosed
+		  = cpp_error (pfile, CPP_DL_PEDWARN,
+			       "character not encodable in a single execution "
+			       "character code unit");
+	      else
+		diagnosed
+		  = cpp_error (pfile, CPP_DL_PEDWARN,
+			       "at least one character in a multi-character "
+			       "literal not encodable in a single execution "
+			       "character code unit");
+	      if (diagnosed && i > max_chars)
+		i = max_chars;
+	    }
+	}
+    }
+  if (diagnosed)
+    /* Already diagnosed above.  */;
+  else if (i > max_chars)
+    {
+      unsigned src_chars
+	= count_source_chars (pfile, token->val.str,
+			      type == CPP_UTF8CHAR ? CPP_CHAR : type);
+
+      if (type != CPP_UTF8CHAR)
+	cpp_error (pfile, CPP_DL_WARNING,
+		   "multi-character literal with %ld characters exceeds "
+		   "'int' size of %ld bytes", (long) i, (long) max_chars);
+      else if (src_chars > 2)
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "multi-character literal cannot have an encoding prefix");
+      else
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "character not encodable in a single code unit");
       i = max_chars;
-      cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
-		 "character constant too long for its type");
     }
   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
     cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
@@ -2651,12 +2799,13 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for wide strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
 		       unsigned int *pchars_seen, int *unsignedp,
-		       enum cpp_ttype type)
+		       const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
   size_t width = converter_for_type (pfile, type).width;
   size_t cwidth = CPP_OPTION (pfile, char_precision);
@@ -2692,14 +2841,25 @@ wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
      character exactly fills a wchar_t, so a multi-character wide
      character constant is guaranteed to overflow.  */
   if (str.len > nbwc * 2)
-    cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
-		       && (type == CPP_CHAR16
-			   || type == CPP_CHAR32
-			   /* In C++23 this is error even for L'ab'.  */
-			   || (type == CPP_WCHAR
-			       && CPP_OPTION (pfile, size_t_literals))))
-		      ? CPP_DL_ERROR : CPP_DL_WARNING,
-	       "character constant too long for its type");
+    {
+      cpp_diagnostic_level level = CPP_DL_WARNING;
+      unsigned src_chars
+	= count_source_chars (pfile, token->val.str, CPP_CHAR);
+
+      if (CPP_OPTION (pfile, cplusplus)
+	  && (type == CPP_CHAR16
+	      || type == CPP_CHAR32
+	      /* In C++23 this is error even for L'ab'.  */
+	      || (type == CPP_WCHAR
+		  && CPP_OPTION (pfile, size_t_literals))))
+	level = CPP_DL_ERROR;
+      if (src_chars > 2)
+	cpp_error (pfile, level,
+		   "multi-character literal cannot have an encoding prefix");
+      else
+	cpp_error (pfile, level,
+		   "character not encodable in a single code unit");
+    }
 
   /* Truncate the constant to its natural width, and simultaneously
      sign- or zero-extend to the full width of cppchar_t.  */
@@ -2754,10 +2914,10 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 
   if (wide)
     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-				    token->type);
+				    token);
   else
     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-				      token->type);
+				      token);
 
   if (str.text != token->val.str.text)
     free ((void *)str.text);