c++: Implement C++26 P1854R4 - Making non-encodable string literals ill-formed [PR110341]

This paper, voted in as a DR, makes some multi-character literals ill-formed.
'abcd' stays valid, but e.g. 'á' is newly invalid in UTF-8 exec charset while
valid e.g. in ISO-8859-1, because it is a single character which needs 2 bytes
to be encoded.

The following patch does that by checking (only pedantically, especially
because it is a DR), whenever we'd emit a -Wmultichar warning because the
character constant has more than one byte in it, whether the number of source
characters is equal to the number of bytes in the multichar string.  If it is,
it is a normal multi-character literal constant and is diagnosed normally with
-Wmultichar; otherwise at least one of the c-chars in the sequence was encoded
as 2+ bytes.

2023-11-14  Jakub Jelinek  <jakub@redhat.com>

	PR c++/110341
libcpp/
	* charset.cc: Implement C++26 P1854R4 - Making non-encodable string
	literals ill-formed.
	(one_count_chars, convert_count_chars, count_source_chars): New
	functions.
	(narrow_str_to_charconst): Change last arg type from cpp_ttype to
	const cpp_token *.  For C++ if pedantic and i > 1 in CPP_CHAR
	interpret token also as CPP_STRING32 and if number of characters
	in the CPP_STRING32 is larger than number of bytes in CPP_CHAR,
	pedwarn on it.  Make the diagnostics more detailed.
	(wide_str_to_charconst): Change last arg type from cpp_ttype to
	const cpp_token *.  Make the diagnostics more detailed.
	(cpp_interpret_charconst): Adjust narrow_str_to_charconst and
	wide_str_to_charconst callers.
gcc/testsuite/
	* g++.dg/cpp26/literals1.C: New test.
	* g++.dg/cpp26/literals2.C: New test.
	* g++.dg/cpp23/wchar-multi1.C: Adjust expected diagnostic wordings.
	* g++.dg/cpp23/wchar-multi2.C: Likewise.
	* gcc.dg/c23-utf8char-3.c: Likewise.
	* gcc.dg/cpp/charconst-4.c: Likewise.
	* gcc.dg/cpp/charconst.c: Likewise.
	* gcc.dg/cpp/if-2.c: Likewise.
	* gcc.dg/utf16-4.c: Likewise.
	* gcc.dg/utf32-4.c: Likewise.
	* g++.dg/cpp1z/utf8-neg.C: Likewise.
	* g++.dg/cpp2a/ucn2.C: Likewise.
	* g++.dg/ext/utf16-4.C: Likewise.
	* g++.dg/ext/utf32-4.C: Likewise.
author: Jakub Jelinek <jakub@redhat.com> 2023-11-14 18:28:34 +0100
committer: Jakub Jelinek <jakub@redhat.com> 2023-11-14 18:28:34 +0100
commit: 194825f20619a1c4b51eaea84f20432fefc0db03 (patch)
tree: 93e0f44cfa40ba14f7585d7aee9464f25b3f15e7 /libcpp/charset.cc
parent: 948b8b6e0e50958ecf56d4d9fb7ac16f245d9cc3 (diff)
download: gcc-194825f20619a1c4b51eaea84f20432fefc0db03.zip
gcc-194825f20619a1c4b51eaea84f20432fefc0db03.tar.gz
gcc-194825f20619a1c4b51eaea84f20432fefc0db03.tar.bz2
1 files changed, 177 insertions, 17 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index d5a0275..9a944d9 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
   return 0;
 }
 
+
+/* Special routine which just counts the number of characters in the
+   string: each call consumes one character from *INBUFP and stores one
+   placeholder uchar at *OUTBUFP.  Returns 0 on success, nonzero on error.  */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+		 uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s = 0; /* Receives the decoded character; never read here.  */
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 1)
+    return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+#else /* Not an ASCII host: size the character using utf_ebcdic_map.  */
+  if (*inbytesleftp < 1)
+    return EINVAL;
+  static const uchar utf_ebcdic_map[256] = {
+    /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html  */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+    1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+    1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+  };
+  rval = utf_ebcdic_map[**inbufp];
+  if (rval == 9) /* 9 marks a trail byte; it cannot start a character.  */
+    return EILSEQ;
+  if (rval == 0) /* 0 entries are counted as one-byte characters.  */
+    rval = 1;
+  if (rval >= 2) /* Lead byte: need RVAL bytes, the rest all trail bytes.  */
+    {
+      if (*inbytesleftp < rval)
+	return EINVAL;
+      for (int i = 1; i < rval; ++i)
+	if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+	  return EILSEQ;
+    }
+  *inbytesleftp -= rval;
+  *inbufp += rval;
+#endif
+
+  **outbufp = ' '; /* Placeholder; only the number of uchars matters.  */
+
+  *outbufp += 1;
+  *outbytesleftp -= 1;
+  return 0;
+}
+
+
 /* Helper routine for the next few functions.  The 'const' on
    one_conversion means that we promise not to modify what function is
    pointed to, which lets the inliner see through it.  */
@@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
 }
 
+/* Counting pseudo-conversion: run one_count_chars over FLEN bytes at FROM
+   via conversion_loop and return its result; only to->len is significant.  */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+		     size_t flen, struct _cpp_strbuf *to)
+{
+  return conversion_loop (one_count_chars, cd, from, flen, to);
+}
+
 /* Identity conversion, used when we have no alternative.  */
 static bool
 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2574,21 +2650,49 @@ cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
 }
 
 
+/* Return the number of source characters in STR, or 0 on failure.  */
+static unsigned
+count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
+{
+  cpp_string str2 = { 0, 0 };
+  bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
+				    enum cpp_warning_reason, rich_location *,
+				    const char *, va_list *)
+    ATTRIBUTE_FPTR_PRINTF(5,0);
+  saved_diagnostic_handler = pfile->cb.diagnostic;
+  pfile->cb.diagnostic = noop_diagnostic_cb; /* Swap in the no-op callback.  */
+  convert_f save_func = pfile->narrow_cset_desc.func;
+  pfile->narrow_cset_desc.func = convert_count_chars; /* Count, not convert.  */
+  bool ret = cpp_interpret_string (pfile, &str, 1, &str2, type);
+  pfile->narrow_cset_desc.func = save_func; /* Restore the real converter.  */
+  pfile->cb.diagnostic = saved_diagnostic_handler; /* Restore diagnostics.  */
+  if (ret)
+    {
+      if (str2.text != str.text) /* Free only a buffer distinct from STR's.  */
+	free ((void *)str2.text);
+      return str2.len; /* NOTE(review): may include a terminator - confirm.  */
+    }
+  else
+    return 0;
+}
+
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 			 unsigned int *pchars_seen, int *unsignedp,
-			 enum cpp_ttype type)
+			 const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   size_t width = CPP_OPTION (pfile, char_precision);
   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
   size_t mask = width_to_mask (width);
   size_t i;
   cppchar_t result, c;
   bool unsigned_p;
+  bool diagnosed = false;
 
   /* The value of a multi-character character constant, or a
      single-character character constant whose representation in the
@@ -2612,11 +2716,55 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 
   if (type == CPP_UTF8CHAR)
     max_chars = 1;
-  if (i > max_chars)
+  else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
     {
+      /* C++ as a DR since
+	 P1854R4 - Making non-encodable string literals ill-formed
+	 makes multi-character narrow character literals if any of the
+	 characters in the literal isn't encodable in char/unsigned char
+	 ill-formed.  We need to count the number of c-chars and compare
+	 that to str.len.  */
+      unsigned src_chars = count_source_chars (pfile, token->val.str, type);
+
+      if (src_chars)
+	{
+	  if (str.len > src_chars)
+	    {
+	      if (src_chars <= 2)
+		diagnosed
+		  = cpp_error (pfile, CPP_DL_PEDWARN,
+			       "character not encodable in a single execution "
+			       "character code unit");
+	      else
+		diagnosed
+		  = cpp_error (pfile, CPP_DL_PEDWARN,
+			       "at least one character in a multi-character "
+			       "literal not encodable in a single execution "
+			       "character code unit");
+	      if (diagnosed && i > max_chars)
+		i = max_chars;
+	    }
+	}
+    }
+  if (diagnosed)
+    /* Already diagnosed above.  */;
+  else if (i > max_chars)
+    {
+      unsigned src_chars
+	= count_source_chars (pfile, token->val.str,
+			      type == CPP_UTF8CHAR ? CPP_CHAR : type);
+
+      if (type != CPP_UTF8CHAR)
+	cpp_error (pfile, CPP_DL_WARNING,
+		   "multi-character literal with %ld characters exceeds "
+		   "'int' size of %ld bytes", (long) i, (long) max_chars);
+      else if (src_chars > 2)
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "multi-character literal cannot have an encoding prefix");
+      else
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "character not encodable in a single code unit");
       i = max_chars;
-      cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
-		 "character constant too long for its type");
     }
   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
     cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
@@ -2651,12 +2799,13 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for wide strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
 		       unsigned int *pchars_seen, int *unsignedp,
-		       enum cpp_ttype type)
+		       const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
   size_t width = converter_for_type (pfile, type).width;
   size_t cwidth = CPP_OPTION (pfile, char_precision);
@@ -2692,14 +2841,25 @@ wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
      character exactly fills a wchar_t, so a multi-character wide
      character constant is guaranteed to overflow.  */
   if (str.len > nbwc * 2)
-    cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
-		       && (type == CPP_CHAR16
-			   || type == CPP_CHAR32
-			   /* In C++23 this is error even for L'ab'.  */
-			   || (type == CPP_WCHAR
-			       && CPP_OPTION (pfile, size_t_literals))))
-		      ? CPP_DL_ERROR : CPP_DL_WARNING,
-	       "character constant too long for its type");
+    {
+      cpp_diagnostic_level level = CPP_DL_WARNING;
+      unsigned src_chars
+	= count_source_chars (pfile, token->val.str, CPP_CHAR);
+
+      if (CPP_OPTION (pfile, cplusplus)
+	  && (type == CPP_CHAR16
+	      || type == CPP_CHAR32
+	      /* In C++23 this is error even for L'ab'.  */
+	      || (type == CPP_WCHAR
+		  && CPP_OPTION (pfile, size_t_literals))))
+	level = CPP_DL_ERROR;
+      if (src_chars > 2)
+	cpp_error (pfile, level,
+		   "multi-character literal cannot have an encoding prefix");
+      else
+	cpp_error (pfile, level,
+		   "character not encodable in a single code unit");
+    }
 
   /* Truncate the constant to its natural width, and simultaneously
      sign- or zero-extend to the full width of cppchar_t.  */
@@ -2754,10 +2914,10 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 
   if (wide)
     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-				    token->type);
+				    token);
   else
     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-				      token->type);
+				      token);
 
   if (str.text != token->val.str.text)
     free ((void *)str.text);
author	Jakub Jelinek <jakub@redhat.com>	2023-11-14 18:28:34 +0100
committer	Jakub Jelinek <jakub@redhat.com>	2023-11-14 18:28:34 +0100
commit	194825f20619a1c4b51eaea84f20432fefc0db03 (patch)
tree	93e0f44cfa40ba14f7585d7aee9464f25b3f15e7 /libcpp/charset.cc
parent	948b8b6e0e50958ecf56d4d9fb7ac16f245d9cc3 (diff)
download	gcc-194825f20619a1c4b51eaea84f20432fefc0db03.zip gcc-194825f20619a1c4b51eaea84f20432fefc0db03.tar.gz gcc-194825f20619a1c4b51eaea84f20432fefc0db03.tar.bz2