1 files changed, 112 insertions, 45 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 3e5578b..b84a974 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1,5 +1,5 @@
 /* CPP Library - charsets
-   Copyright (C) 1998-2020 Free Software Foundation, Inc.
+   Copyright (C) 1998-2021 Free Software Foundation, Inc.
 
    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
 
@@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = {
    cset_converter structure for conversion from FROM to TO.  If
    iconv_open() fails, issue an error and return an identity
    converter.  Silently return an identity converter if FROM and TO
-   are identical.  */
+   are identical.
+
+   PFILE is only used for generating diagnostics; setting it to NULL
+   suppresses diagnostics.  */
+
 static struct cset_converter
 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 {
@@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 
       if (ret.cd == (iconv_t) -1)
 	{
-	  if (errno == EINVAL)
-	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
-		       "conversion from %s to %s not supported by iconv",
-		       from, to);
-	  else
-	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
-
+	  if (pfile)
+	    {
+	      if (errno == EINVAL)
+		cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
+			   "conversion from %s to %s not supported by iconv",
+			   from, to);
+	      else
+		cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
+	    }
 	  ret.func = convert_no_conversion;
 	}
     }
   else
     {
-      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
-		 "no iconv implementation, cannot convert from %s to %s",
-		 from, to);
+      if (pfile)
+	{
+	  cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
+		     "no iconv implementation, cannot convert from %s to %s",
+		     from, to);
+	}
       ret.func = convert_no_conversion;
       ret.cd = (iconv_t) -1;
       ret.width = -1;
     }
+
   return ret;
 }
 
@@ -884,14 +894,18 @@ enum {
   C11 = 8,
   /* Valid in a C11/C++11 identifier, but not as the first character?  */
   N11 = 16,
+  /* Valid in a C++23 identifier?  */
+  CXX23 = 32,
+  /* Valid in a C++23 identifier, but not as the first character?  */
+  NXX23 = 64,
   /* NFC representation is not valid in an identifier?  */
-  CID = 32,
+  CID = 128,
   /* Might be valid NFC form?  */
-  NFC = 64,
+  NFC = 256,
   /* Might be valid NFKC form?  */
-  NKC = 128,
+  NKC = 512,
   /* Certain preceding characters might make it not valid NFC/NKFC form?  */
-  CTX = 256
+  CTX = 1024
 };
 
 struct ucnrange {
@@ -938,10 +952,12 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
   /* When -pedantic, we require the character to have been listed by
      the standard for the current language.  Otherwise, we accept the
      union of the acceptable sets for all supported language versions.  */
-  valid_flags = C99 | CXX | C11;
+  valid_flags = C99 | CXX | C11 | CXX23;
   if (CPP_PEDANTIC (pfile))
     {
-      if (CPP_OPTION (pfile, c11_identifiers))
+      if (CPP_OPTION (pfile, cxx23_identifiers))
+	valid_flags = CXX23;
+      else if (CPP_OPTION (pfile, c11_identifiers))
 	valid_flags = C11;
       else if (CPP_OPTION (pfile, c99))
 	valid_flags = C99;
@@ -950,12 +966,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
     }
   if (! (ucnranges[mn].flags & valid_flags))
       return 0;
-  if (CPP_OPTION (pfile, c11_identifiers))
-    invalid_start_flags = N11;
-  else if (CPP_OPTION (pfile, c99))
-    invalid_start_flags = N99;
-  else
-    invalid_start_flags = 0;
 
   /* Update NST.  */
   if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@@ -998,6 +1008,28 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
     nst->previous = c;
   nst->prev_class = ucnranges[mn].combine;
 
+  if (!CPP_PEDANTIC (pfile))
+    {
+      /* If not -pedantic, accept as character that may
+	 begin an identifier a union of characters allowed
+	 at that position in each of the character sets.  */
+      if ((ucnranges[mn].flags & (C99 | N99)) == C99
+	  || (ucnranges[mn].flags & CXX) != 0
+	  || (ucnranges[mn].flags & (C11 | N11)) == C11
+	  || (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23)
+	return 1;
+      return 2;
+    }
+
+  if (CPP_OPTION (pfile, cxx23_identifiers))
+    invalid_start_flags = NXX23;
+  else if (CPP_OPTION (pfile, c11_identifiers))
+    invalid_start_flags = N11;
+  else if (CPP_OPTION (pfile, c99))
+    invalid_start_flags = N99;
+  else
+    invalid_start_flags = 0;
+
   /* In C99, UCN digits may not begin identifiers.  In C11 and C++11,
      UCN combining characters may not begin identifiers.  */
   if (ucnranges[mn].flags & invalid_start_flags)
@@ -2122,6 +2154,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
 				  buf, bufp - buf, HT_ALLOC));
 }
 
+
+/* Utility to strip a UTF-8 byte order marking from the beginning
+   of a buffer.  Returns the number of bytes to skip, which currently
+   will be either 0 or 3.  */
+int
+cpp_check_utf8_bom (const char *data, size_t data_length)
+{
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  const unsigned char *udata = (const unsigned char *) data;
+  if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
+      && udata[2] == 0xbf)
+    return 3;
+#endif
+
+  return 0;
+}
+
+
 /* Convert an input buffer (containing the complete contents of one
    source file) from INPUT_CHARSET to the source character set.  INPUT
    points to the input buffer, SIZE is its allocated size, and LEN is
@@ -2135,7 +2186,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
    INPUT is expected to have been allocated with xmalloc.  This
    function will either set *BUFFER_START to INPUT, or free it and set
    *BUFFER_START to a pointer to another xmalloc-allocated block of
-   memory.  */
+   memory.
+
+   PFILE is only used to generate diagnostics; setting it to NULL suppresses
+   diagnostics, and causes a return of NULL if there was any error instead.  */
+
 uchar * 
 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
 		    uchar *input, size_t size, size_t len,
@@ -2158,17 +2213,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
       to.text = XNEWVEC (uchar, to.asize);
       to.len = 0;
 
-      if (!APPLY_CONVERSION (input_cset, input, len, &to))
-	cpp_error (pfile, CPP_DL_ERROR,
-		   "failure to convert %s to %s",
-		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
-
+      const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
       free (input);
-    }
 
-  /* Clean up the mess.  */
-  if (input_cset.func == convert_using_iconv)
-    iconv_close (input_cset.cd);
+      /* Clean up the mess.  */
+      if (input_cset.func == convert_using_iconv)
+	iconv_close (input_cset.cd);
+
+      /* Handle conversion failure.  */
+      if (!ok)
+	{
+	  if (!pfile)
+	    {
+	      XDELETEVEC (to.text);
+	      *buffer_start = NULL;
+	      *st_size = 0;
+	      return NULL;
+	    }
+	  cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
+		     input_charset, SOURCE_CHARSET);
+	}
+    }
 
   /* Resize buffer if we allocated substantially too much, or if we
      haven't enough space for the \n-terminator or following
@@ -2192,19 +2257,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
 
   buffer = to.text;
   *st_size = to.len;
-#if HOST_CHARSET == HOST_CHARSET_ASCII
-  /* The HOST_CHARSET test just above ensures that the source charset
-     is UTF-8.  So, ignore a UTF-8 BOM if we see one.  Note that
-     glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+
+  /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8.  Note
+     that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
      BOM -- however, even if it did, we would still need this code due
      to the 'convert_no_conversion' case.  */
-  if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
-      && to.text[2] == 0xbf)
-    {
-      *st_size -= 3;
-      buffer += 3;
-    }
-#endif
+  const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
+  *st_size -= bom_len;
+  buffer += bom_len;
 
   *buffer_start = to.text;
   return buffer;
@@ -2244,6 +2304,13 @@ _cpp_default_encoding (void)
   return current_encoding;
 }
 
+/* Check if the configured input charset requires no conversion, other than
+   possibly stripping a UTF-8 BOM.  */
+bool cpp_input_conversion_is_trivial (const char *input_charset)
+{
+  return !strcasecmp (input_charset, SOURCE_CHARSET);
+}
+
 /* Implementation of class cpp_string_location_reader.  */
 
 /* Constructor for cpp_string_location_reader.  */