c++: Implement C++23 P2071R2 - Named universal character escapes [PR106648]

The following patch implements the C++23 P2071R2 - Named universal character escapes paper to support \N{LATIN SMALL LETTER E} etc.

I've used Unicode 14.0; there are 144803 character name properties (including the ones generated by Unicode NR1 and NR2 rules) and correction/control/alternate aliases, together with zero terminators that would be 3884745 bytes, which is clearly unacceptable for libcpp.

This patch instead contains a generator which from the UnicodeData.txt and NameAliases.txt files emits a space optimized radix tree (208765 bytes long for 14.0), a single string literal dictionary (59418 bytes), maximum name length (currently 88 chars) and two small helper arrays for the NR1/NR2 name generation. The radix tree needs 2 to 9 bytes per node; the exact format is described in the generator program. There could be ways to shrink the dictionary size somewhat at the expense of slightly slower lookups.

Currently the patch implements strict matching (that is what is needed to actually implement it on valid code) and Unicode UAX44-LM2 algorithm loose matching to provide hints (that algorithm essentially ignores hyphens in between two alphanumeric characters, spaces and underscores (with one exception for hyphen) and does case insensitive matching).

In the attachment is a WIP patch that shows how to implement also spellcheck.{h,cc} style discovery of misspellings, but I'll need to talk to David Malcolm about it, as spellcheck.{h,cc} is in gcc/ subdir (so the WIP incremental patch instead prints all the names to stderr).

2022-08-26  Jakub Jelinek  <jakub@redhat.com>

	PR c++/106648
libcpp/
	* charset.cc: Implement C++23 P2071R2 - Named universal character escapes. Include uname2c.h.
	(hangul_syllables, hangul_count): New variables.
	(struct uname2c_data): New type.
	(_cpp_uname2c, _cpp_uname2c_uax44_lm2): New functions.
	(_cpp_valid_ucn): Use them. Handle named universal character escapes.
	(convert_ucn): Adjust comment.
	(convert_escape): Call convert_ucn even for \N.
	(_cpp_interpret_identifier): Handle named universal character escapes.
	* lex.cc (get_bidi_ucn): Fix up function comment formatting.
	(get_bidi_named): New function.
	(forms_identifier_p, lex_string): Handle named universal character escapes.
	* makeuname2c.cc: New file. Small parts copied from makeucnid.cc.
	* uname2c.h: New generated file.
gcc/c-family/
	* c-cppbuiltin.cc (c_cpp_builtins): Predefine __cpp_named_character_escapes to 202207L.
gcc/testsuite/
	* c-c++-common/cpp/named-universal-char-escape-1.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-2.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-3.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-4.c: New test.
	* c-c++-common/Wbidi-chars-25.c: New test.
	* gcc.dg/cpp/named-universal-char-escape-1.c: New test.
	* gcc.dg/cpp/named-universal-char-escape-2.c: New test.
	* g++.dg/cpp/named-universal-char-escape-1.C: New test.
	* g++.dg/cpp/named-universal-char-escape-2.C: New test.
	* g++.dg/cpp23/feat-cxx2b.C: Test __cpp_named_character_escapes.
author: Jakub Jelinek <jakub@redhat.com> 2022-08-26 09:24:56 +0200
committer: Jakub Jelinek <jakub@redhat.com> 2022-08-26 09:27:39 +0200
commit: eb4879ab9053085a59b8d1594ef76487948bba7e (patch)
tree: ea5328515c81dd4505284ce2cd0aa37ebaa56b40 /libcpp/lex.cc
parent: 670961f051aedbac21bc769c21c5b28b338b6003 (diff)
download: gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.zip
gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.gz
gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.bz2
1 files changed, 66 insertions, 9 deletions
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 571cd2a..528d598 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -1512,7 +1512,7 @@ get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
 }
 
 /* Parse a UCN where P points just past \u or \U and return its bidi code.
-   If the kind is not NONE, write the location to *OUT.*/
+   If the kind is not NONE, write the location to *OUT.  */
 
 static bidi::kind
 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
@@ -1529,6 +1529,56 @@ get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
   return result;
 }
 
+/* Parse a named universal character escape where P points just past \N and
+   return its bidi code.  If the kind is not NONE, write the location of the
+   whole escape, backslash through closing brace, to *OUT.  */
+
+static bidi::kind
+get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
+{
+  bidi::kind result = bidi::kind::NONE;
+  if (*p != '{')
+    return bidi::kind::NONE;
+  /* NAME skips '{'; each suffix offset below is its prefix's length.  */
+  const char *name = (const char *) (p + 1);
+  if (strncmp (name, "LEFT-TO-RIGHT ", 14) == 0)
+    {
+      if (strncmp (name + 14, "MARK}", 5) == 0)
+	result = bidi::kind::LTR;
+      else if (strncmp (name + 14, "EMBEDDING}", 10) == 0)
+	result = bidi::kind::LRE;
+      else if (strncmp (name + 14, "OVERRIDE}", 9) == 0)
+	result = bidi::kind::LRO;
+      else if (strncmp (name + 14, "ISOLATE}", 8) == 0)
+	result = bidi::kind::LRI;
+    }
+  else if (strncmp (name, "RIGHT-TO-LEFT ", 14) == 0)
+    {
+      if (strncmp (name + 14, "MARK}", 5) == 0)
+	result = bidi::kind::RTL;
+      else if (strncmp (name + 14, "EMBEDDING}", 10) == 0)
+	result = bidi::kind::RLE;
+      else if (strncmp (name + 14, "OVERRIDE}", 9) == 0)
+	result = bidi::kind::RLO;
+      else if (strncmp (name + 14, "ISOLATE}", 8) == 0)
+	result = bidi::kind::RLI;
+    }
+  else if (strncmp (name, "POP DIRECTIONAL ", 16) == 0)
+    {
+      if (strncmp (name + 16, "FORMATTING}", 11) == 0)
+	result = bidi::kind::PDF;
+      else if (strncmp (name + 16, "ISOLATE}", 8) == 0)
+	result = bidi::kind::PDI;
+    }
+  else if (strncmp (name, "FIRST STRONG ISOLATE}", 21) == 0)
+    result = bidi::kind::FSI;
+  if (result != bidi::kind::NONE)
+    *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
+						    strchr (name, '}')
+						    - name + 4);
+  return result;
+}
+
 /* Subclass of rich_location for reporting on unpaired UTF-8
    bidirectional control character(s).
    Escape the source lines on output, and show all unclosed
@@ -1914,16 +1964,20 @@ forms_identifier_p (cpp_reader *pfile, int first,
 	    return true;
 	}
       else if (*buffer->cur == '\\'
-	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+	       && (buffer->cur[1] == 'u'
+		   || buffer->cur[1] == 'U'
+		   || buffer->cur[1] == 'N'))
 	{
 	  buffer->cur += 2;
 	  if (warn_bidi_p)
 	    {
 	      location_t loc;
-	      bidi::kind kind = get_bidi_ucn (pfile,
-					      buffer->cur,
-					      buffer->cur[-1] == 'U',
-					      &loc);
+	      bidi::kind kind;
+	      if (buffer->cur[-1] == 'N')
+		kind = get_bidi_named (pfile, buffer->cur, &loc);
+	      else
+		kind = get_bidi_ucn (pfile, buffer->cur,
+				     buffer->cur[-1] == 'U', &loc);
 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
 	    }
 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
@@ -2657,11 +2711,14 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
       /* In #include-style directives, terminators are not escapable.  */
       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 	{
-	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
+	  if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
 	    {
 	      location_t loc;
-	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
-					      &loc);
+	      bidi::kind kind;
+	      if (cur[0] == 'N')
+		kind = get_bidi_named (pfile, cur + 1, &loc);
+	      else
+		kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
 	    }
 	  cur++;
author	Jakub Jelinek <jakub@redhat.com>	2022-08-26 09:24:56 +0200
committer	Jakub Jelinek <jakub@redhat.com>	2022-08-26 09:27:39 +0200
commit	eb4879ab9053085a59b8d1594ef76487948bba7e (patch)
tree	ea5328515c81dd4505284ce2cd0aa37ebaa56b40 /libcpp/lex.cc
parent	670961f051aedbac21bc769c21c5b28b338b6003 (diff)
download	gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.zip gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.gz gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.bz2