Support extended characters in C/C++ identifiers (PR c/67224)

libcpp/ChangeLog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens. * internal.h (_cpp_valid_utf8): Declare. * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers. (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens. Do all work in "default" case to avoid slowing down typical code paths. Also handle $ and UCN in the default case for consistency. gcc/Changelog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * doc/cpp.texi: Document support for extended characters in identifiers. * doc/cppopts.texi: Likewise. gcc/testsuite/ChangeLog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test. * g++.dg/cpp/ucnid-1-utf8.C: New test. * g++.dg/cpp/ucnid-2-utf8.C: New test. * g++.dg/cpp/ucnid-3-utf8.C: New test. * g++.dg/cpp/ucnid-4-utf8.C: New test. * g++.dg/other/ucnid-1-utf8.C: New test. * gcc.dg/cpp/ucnid-1-utf8.c: New test. * gcc.dg/cpp/ucnid-10-utf8.c: New test. * gcc.dg/cpp/ucnid-11-utf8.c: New test. * gcc.dg/cpp/ucnid-12-utf8.c: New test. * gcc.dg/cpp/ucnid-13-utf8.c: New test. * gcc.dg/cpp/ucnid-14-utf8.c: New test. * gcc.dg/cpp/ucnid-15-utf8.c: New test. * gcc.dg/cpp/ucnid-2-utf8.c: New test. * gcc.dg/cpp/ucnid-3-utf8.c: New test. * gcc.dg/cpp/ucnid-4-utf8.c: New test. * gcc.dg/cpp/ucnid-6-utf8.c: New test. * gcc.dg/cpp/ucnid-7-utf8.c: New test. * gcc.dg/cpp/ucnid-9-utf8.c: New test. * gcc.dg/ucnid-1-utf8.c: New test. * gcc.dg/ucnid-10-utf8.c: New test. * gcc.dg/ucnid-11-utf8.c: New test. * gcc.dg/ucnid-12-utf8.c: New test. * gcc.dg/ucnid-13-utf8.c: New test. * gcc.dg/ucnid-14-utf8.c: New test. * gcc.dg/ucnid-15-utf8.c: New test. * gcc.dg/ucnid-16-utf8.c: New test. * gcc.dg/ucnid-2-utf8.c: New test. * gcc.dg/ucnid-3-utf8.c: New test. * gcc.dg/ucnid-4-utf8.c: New test. * gcc.dg/ucnid-5-utf8.c: New test. * gcc.dg/ucnid-6-utf8.c: New test. * gcc.dg/ucnid-7-utf8.c: New test. * gcc.dg/ucnid-8-utf8.c: New test. * gcc.dg/ucnid-9-utf8.c: New test. From-SVN: r275979
author: Lewis Hyatt <lhyatt@gmail.com> 2019-09-19 19:56:11 +0000
committer: Joseph Myers <jsm28@gcc.gnu.org> 2019-09-19 20:56:11 +0100
commit: 7d112d6670a0e0e662f8a7e64c33686e475832c8 (patch)
tree: 983eb23217b2572ff4fe5a7f7fe0e5c0c0b9a48d /libcpp/lex.c
parent: e0710fcf7dc70054a9a20ab1b8d77f4fef26ef2c (diff)
download: gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.zip
gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.gz
gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.bz2
1 files changed, 37 insertions, 18 deletions
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 52e5bce..0e8de38 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile,
     }
 }
 
-/* Returns TRUE if the sequence starting at buffer->cur is invalid in
+static const cppchar_t utf8_signifier = 0xC0;
+
+/* Returns TRUE if the sequence starting at buffer->cur is valid in
    an identifier.  FIRST is TRUE if this starts an identifier.  */
 static bool
 forms_identifier_p (cpp_reader *pfile, int first,
@@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first,
       return true;
     }
 
-  /* Is this a syntactically valid UCN?  */
-  if (CPP_OPTION (pfile, extended_identifiers)
-      && *buffer->cur == '\\'
-      && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+  /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
+  if (CPP_OPTION (pfile, extended_identifiers))
     {
       cppchar_t s;
-      buffer->cur += 2;
-      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
-			  state, &s, NULL, NULL))
-	return true;
-      buffer->cur -= 2;
+      if (*buffer->cur >= utf8_signifier)
+	{
+	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+			       state, &s))
+	    return true;
+	}
+      else if (*buffer->cur == '\\'
+	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+	{
+	  buffer->cur += 2;
+	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+			      state, &s, NULL, NULL))
+	    return true;
+	  buffer->cur -= 2;
+	}
     }
 
   return false;
@@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   pfile->buffer->cur = cur;
   if (starts_ucn || forms_identifier_p (pfile, false, nst))
     {
-      /* Slower version for identifiers containing UCNs (or $).  */
+      /* Slower version for identifiers containing UCNs
+	 or extended chars (including $).  */
       do {
 	while (ISIDNUM (*pfile->buffer->cur))
 	  {
@@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile)
       /* @ is a punctuator in Objective-C.  */
     case '@': result->type = CPP_ATSIGN; break;
 
-    case '$':
-    case '\\':
+    default:
       {
 	const uchar *base = --buffer->cur;
-	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 
+	/* Check for an extended identifier ($ or UCN or UTF-8).  */
+	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 	if (forms_identifier_p (pfile, true, &nst))
 	  {
 	    result->type = CPP_NAME;
@@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile)
 	    warn_about_normalization (pfile, result, &nst);
 	    break;
 	  }
+
+	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
+	   single token.  */
 	buffer->cur++;
+	if (c >= utf8_signifier)
+	  {
+	    const uchar *pstr = base;
+	    cppchar_t s;
+	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
+	      buffer->cur = pstr;
+	  }
+	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
+	break;
       }
-      /* FALLTHRU */
 
-    default:
-      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
-      break;
     }
 
   /* Potentially convert the location of the token to a range.  */
author	Lewis Hyatt <lhyatt@gmail.com>	2019-09-19 19:56:11 +0000
committer	Joseph Myers <jsm28@gcc.gnu.org>	2019-09-19 20:56:11 +0100
commit	7d112d6670a0e0e662f8a7e64c33686e475832c8 (patch)
tree	983eb23217b2572ff4fe5a7f7fe0e5c0c0b9a48d /libcpp/lex.c
parent	e0710fcf7dc70054a9a20ab1b8d77f4fef26ef2c (diff)
download	gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.zip gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.gz gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.bz2