aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.cc
diff options
context:
space:
mode:
author: Jakub Jelinek <jakub@redhat.com> 2022-08-26 09:24:56 +0200
committer: Jakub Jelinek <jakub@redhat.com> 2022-08-26 09:27:39 +0200
commit: eb4879ab9053085a59b8d1594ef76487948bba7e (patch)
tree: ea5328515c81dd4505284ce2cd0aa37ebaa56b40 /libcpp/charset.cc
parent: 670961f051aedbac21bc769c21c5b28b338b6003 (diff)
downloadgcc-eb4879ab9053085a59b8d1594ef76487948bba7e.zip
gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.gz
gcc-eb4879ab9053085a59b8d1594ef76487948bba7e.tar.bz2
c++: Implement C++23 P2071R2 - Named universal character escapes [PR106648]
The following patch implements the C++23 P2071R2 - Named universal character escapes paper to support \N{LATIN SMALL LETTER E} etc. I've used Unicode 14.0, there are 144803 character name properties (including the ones generated by Unicode NR1 and NR2 rules) and correction/control/alternate aliases, together with zero terminators that would be 3884745 bytes, which is clearly unacceptable for libcpp. This patch instead contains a generator which from the UnicodeData.txt and NameAliases.txt files emits a space optimized radix tree (208765 bytes long for 14.0), a single string literal dictionary (59418 bytes), maximum name length (currently 88 chars) and two small helper arrays for the NR1/NR2 name generation. The radix tree needs 2 to 9 bytes per node, the exact format is described in the generator program. There could be ways to shrink the dictionary size somewhat at the expense of slightly slower lookups. Currently the patch implements strict matching (that is what is needed to actually implement it on valid code) and Unicode UAX44-LM2 algorithm loose matching to provide hints (that algorithm essentially ignores hyphens in between two alphanumeric characters, spaces and underscores (with one exception for hyphen) and does case insensitive matching). In the attachment is a WIP patch that shows how to implement also spellcheck.{h,cc} style discovery of misspellings, but I'll need to talk to David Malcolm about it, as spellcheck.{h,cc} is in gcc/ subdir (so the WIP incremental patch instead prints all the names to stderr). 2022-08-26 Jakub Jelinek <jakub@redhat.com> PR c++/106648 libcpp/ * charset.cc: Implement C++23 P2071R2 - Named universal character escapes. Include uname2c.h. (hangul_syllables, hangul_count): New variables. (struct uname2c_data): New type. (_cpp_uname2c, _cpp_uname2c_uax44_lm2): New functions. (_cpp_valid_ucn): Use them. Handle named universal character escapes. (convert_ucn): Adjust comment. (convert_escape): Call convert_ucn even for \N. 
(_cpp_interpret_identifier): Handle named universal character escapes. * lex.cc (get_bidi_ucn): Fix up function comment formatting. (get_bidi_named): New function. (forms_identifier_p, lex_string): Handle named universal character escapes. * makeuname2c.cc: New file. Small parts copied from makeucnid.cc. * uname2c.h: New generated file. gcc/c-family/ * c-cppbuiltin.cc (c_cpp_builtins): Predefine __cpp_named_character_escapes to 202207L. gcc/testsuite/ * c-c++-common/cpp/named-universal-char-escape-1.c: New test. * c-c++-common/cpp/named-universal-char-escape-2.c: New test. * c-c++-common/cpp/named-universal-char-escape-3.c: New test. * c-c++-common/cpp/named-universal-char-escape-4.c: New test. * c-c++-common/Wbidi-chars-25.c: New test. * gcc.dg/cpp/named-universal-char-escape-1.c: New test. * gcc.dg/cpp/named-universal-char-escape-2.c: New test. * g++.dg/cpp/named-universal-char-escape-1.C: New test. * g++.dg/cpp/named-universal-char-escape-2.C: New test. * g++.dg/cpp23/feat-cxx2b.C: Test __cpp_named_character_escapes.
Diffstat (limited to 'libcpp/charset.cc')
-rw-r--r-- libcpp/charset.cc 535
1 files changed, 487 insertions, 48 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index cf4a525..47a36d8 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -921,6 +921,342 @@ struct ucnrange {
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
#define UCS_LIMIT 0x10FFFF
+#include "uname2c.h"
+
+static const char hangul_syllables[][4] = { /* Name fragments used by the
+ NR1 (Hangul syllable) lookup in _cpp_uname2c. The table consists of
+ three consecutive groups whose sizes are given by hangul_count; each
+ entry is at most 3 characters plus the terminating NUL. The empty
+ string entries match with length 0, i.e. they are what is picked
+ when no longer fragment of the group matches. */
+ /* L - first group, 19 entries (hangul_count[0]). */
+ "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "",
+ "J", "JJ", "C", "K", "T", "P", "H",
+ /* V - second group, 21 entries (hangul_count[1]). */
+ "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE",
+ "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I",
+ /* T - third group, 28 entries (hangul_count[2]). */
+ "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
+ "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C",
+ "K", "T", "P", "H"
+};
+
+static const short hangul_count[6] = { 19, 21, 28 }; /* Number of entries
+ in the L, V and T groups of hangul_syllables, in that order. Only
+ elements 0-2 are read by the code in this patch; the remaining
+ elements of the 6-element array are implicitly zero. */
+
+/* Used for Unicode loose matching rule UAX44-LM2 matching. This is the
+ state _cpp_uname2c carries along while descending the tree when it is
+ asked to do loose instead of strict matching. */
+
+struct uname2c_data
+{
+ char *canon_name; /* Where the canonical spelling of the name is stored.
+ _cpp_uname2c copies each matched key here and advances the pointer
+ past it before recursing into the children. */
+ char prev_char; /* Last character of the key matched one level up
+ (initially ' '). Consulted when a key starts with '-' to decide
+ whether that hyphen is medial. */
+};
+
+/* Map NAME, a Unicode character name or correction/control/alternate
+ alias, to a Unicode codepoint, or return (cppchar_t) -1 if
+ not found. This uses a space optimized radix tree precomputed
+ by the makeuname2c utility, with binary format documented in its
+ source makeuname2c.cc.
+ NAME points to LEN bytes of (remaining) name text; it is only accessed
+ through LEN, no NUL terminator is looked for.
+ N points to the first of a run of sibling nodes in the tree.
+ DATA is NULL for strict matching, where keys are compared with memcmp.
+ If DATA is non-NULL, loose matching is done instead: spaces and medial
+ hyphens in the keys are skipped while comparing (NAME is expected to
+ have been stripped already, see _cpp_uname2c_uax44_lm2), mismatching
+ siblings are all tried in turn, children are searched recursively and
+ on success the canonical name is stored, NUL terminated, through
+ DATA->canon_name. */
+
+static cppchar_t
+_cpp_uname2c (const char *name, size_t len, const unsigned char *n,
+ struct uname2c_data *data)
+{
+ do
+ { /* Each iteration decodes the node at N and compares its key. */
+ char k;
+ const char *key;
+ size_t key_len, len_adj;
+ bool has_value = *n & 0x40; /* Bit 0x40: node carries a codepoint. */
+ bool has_children, no_sibling = false;
+ cppchar_t codepoint = -1;
+ const unsigned char *child = NULL;
+ int ret;
+
+ if (*n & 0x80)
+ { /* Bit 0x80: the key is a single character, stored in the low
+ 6 bits as an offset from ' '. */
+ k = ' ' + (*n++ & 0x3f);
+ key = &k;
+ key_len = 1;
+ }
+ else
+ { /* Otherwise the low 6 bits are the key length and the next two
+ bytes are a little-endian offset of the key in uname2c_dict. */
+ key_len = *n++ & 0x3f;
+ key = &uname2c_dict[*n++];
+ key += (*n++ << 8);
+ }
+ if (has_value)
+ { /* Three bytes follow: 16 low codepoint bits, then 5 high
+ codepoint bits together with the has-children (0x80) and
+ no-sibling (0x40) flags. */
+ codepoint = *n + (n[1] << 8) + ((n[2] & 0x1f) << 16);
+ has_children = n[2] & 0x80;
+ no_sibling = n[2] & 0x40;
+ n += 3;
+ }
+ else
+ has_children = true; /* A node without a value always has children. */
+ if (has_children)
+ { /* Decode the child offset: 7 bits per byte, least significant
+ group first, bit 0x80 set on all bytes but the last. The
+ offset is relative to the byte following the encoded offset. */
+ unsigned int shift = 0;
+ size_t child_off = 0;
+
+ do
+ {
+ child_off |= (*n & 0x7f) << shift;
+ shift += 7;
+ }
+ while ((*n++ & 0x80) != 0);
+ child = n + child_off;
+ }
+ if (__builtin_expect (data == NULL, 1))
+ { /* Strict matching: compare as many bytes as both NAME and the
+ key have; the case where NAME is shorter than the key is
+ rejected below through LEN_ADJ. */
+ ret = memcmp (name, key, len > key_len ? key_len : len);
+ len_adj = key_len;
+ }
+ else
+ { /* Loose matching: P walks NAME, Q walks the key. */
+ const char *p = name, *q = key;
+
+ while (1)
+ {
+ if ((size_t) (p - name) == len || (size_t) (q - key) == key_len)
+ break;
+ if (*q == ' ')
+ {
+ ++q;
+ continue;
+ }
+ if (*q == '-')
+ {
+ /* This is the hard case. Only medial hyphens
+ should be removed, where medial means preceded
+ and followed by alnum. */
+ if (ISALNUM (q == key ? data->prev_char : q[-1]))
+ {
+ if (q + 1 == key + key_len)
+ {
+ /* We don't know what the next letter will be.
+ It could be ISALNUM, then we are supposed
+ to omit it, or it could be a space and then
+ we should not omit it and need to compare it.
+ Fortunately the only 3 names with hyphen
+ followed by non-letter are
+ U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
+ U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
+ U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
+ Furthermore, prefixes of NR2 generated
+ ranges all end with a hyphen, but the generated
+ part is then followed by alpha-numeric.
+ So, let's just assume that - at the end of
+ key is always followed by alphanumeric and
+ so should be omitted.
+ makeuname2c.cc verifies that this is true. */
+ ++q;
+ continue;
+ }
+ else if (ISALNUM (q[1]))
+ {
+ ++q;
+ continue;
+ }
+ }
+ }
+ if (*p != *q)
+ break;
+ ++p;
+ ++q;
+ }
+ len_adj = p - name; /* Number of NAME bytes this key consumed. */
+ /* If we don't consume the whole key, signal a mismatch,
+ but always with ret = 1, so that we keep looking through
+ siblings. */
+ ret = q < key + key_len;
+ }
+ if (ret < 0)
+ return -1; /* NAME compares below this key. NOTE(review): giving up
+ here instead of trying further siblings presumably relies on the
+ siblings being stored in ascending order - confirm against
+ makeuname2c.cc. */
+ else if (ret == 0)
+ {
+ if (len < len_adj)
+ return -1; /* NAME ended in the middle of the key. */
+ else if (codepoint >= 0xd800
+ && codepoint < 0xd800 + ARRAY_SIZE (uname2c_generated))
+ { /* Values in this range are not results but markers for
+ generated names: 0xd800 selects rule NR1, the others index
+ uname2c_generated for rule NR2. The key matched so far is
+ the prefix, the rest of NAME is the generated part. */
+ name += len_adj;
+ len -= len_adj;
+ if (codepoint == 0xd800)
+ {
+ /* NR1 - Hangul syllables. */
+ size_t start = 0, end, i, j;
+ int this_len, max_len;
+ char winner[3];
+
+ for (i = 0; i < 3; ++i)
+ { /* For each of the three groups of hangul_syllables
+ pick the longest entry which is a prefix of the
+ rest of NAME and remember its index in the group. */
+ end = start + hangul_count[i];
+ max_len = -1;
+ winner[i] = -1;
+ for (j = start; j < end; j++)
+ {
+ this_len = strlen (hangul_syllables[j]);
+ if (len >= (size_t) this_len
+ && this_len > max_len
+ && memcmp (name, hangul_syllables[j],
+ this_len) == 0)
+ {
+ max_len = this_len;
+ winner[i] = j - start;
+ }
+ }
+ if (max_len == -1)
+ return -1;
+ name += max_len;
+ len -= max_len;
+ start = end;
+ }
+ if (__builtin_expect (data != NULL, 0))
+ { /* Loose matching: store prefix plus the three
+ selected fragments as the canonical name. */
+ memcpy (data->canon_name, key, key_len);
+ data->canon_name[key_len] = '\0';
+ for (i = 0, start = 0; i < 3; ++i)
+ {
+ strcat (data->canon_name,
+ hangul_syllables[start + winner[i]]);
+ start += hangul_count[i];
+ }
+ }
+ return (0xac00 + 21 * 28 * winner[0]
+ + 28 * winner[1] + winner[2]); /* 21 and 28 are the
+ sizes of the second and third group. */
+ }
+ else
+ {
+ /* NR2 - prefix followed by hexadecimal codepoint. */
+ const cppchar_t *p;
+ size_t i;
+
+ if (len < 4 || len > 5)
+ return -1; /* Only 4 or 5 hex digits are accepted. */
+ p = uname2c_pairs + uname2c_generated[codepoint - 0xd800];
+ codepoint = 0;
+ for (i = 0; i < len; ++i)
+ {
+ codepoint <<= 4;
+ if (!ISXDIGIT (name[i]))
+ return -1;
+ codepoint += hex_value (name[i]);
+ }
+ for (; *p; p += 2) /* Walk the zero terminated list of
+ (first, last) pairs valid for this prefix. NOTE(review):
+ the early return below presumably relies on the pairs
+ being in ascending order - confirm in makeuname2c.cc. */
+ if (codepoint < *p)
+ return -1;
+ else if (codepoint <= p[1])
+ {
+ if (__builtin_expect (data != NULL, 0))
+ {
+ memcpy (data->canon_name, key, key_len);
+ memcpy (data->canon_name + key_len, name, len);
+ data->canon_name[key_len + len] = '\0';
+ }
+ return codepoint;
+ }
+ return -1;
+ }
+ }
+ else if (__builtin_expect (data != NULL, 0))
+ { /* Loose matching of an ordinary node. */
+ if (len == len_adj)
+ { /* All of NAME consumed: finish the canonical name and
+ return whatever CODEPOINT holds, which is still
+ (cppchar_t) -1 if this node has no value. */
+ memcpy (data->canon_name, key, key_len);
+ data->canon_name[key_len] = '\0';
+ return codepoint;
+ }
+ if (has_children)
+ { /* Try the children recursively; on failure restore
+ DATA and fall through to the next sibling. */
+ struct uname2c_data save = *data;
+ memcpy (data->canon_name, key, key_len);
+ data->canon_name += key_len;
+ data->prev_char = key[key_len - 1];
+ codepoint = _cpp_uname2c (name + len_adj, len - len_adj,
+ child, data);
+ if (codepoint != (cppchar_t) -1)
+ return codepoint;
+ *data = save;
+ }
+ }
+ else if (len == len_adj)
+ return codepoint; /* Strict match consumed all of NAME. */
+ else if (!has_children)
+ return -1;
+ else
+ { /* Strict match of a prefix: continue iteratively with the
+ children and the rest of NAME. */
+ name += len_adj;
+ len -= len_adj;
+ n = child;
+ continue;
+ }
+ }
+ if (no_sibling || (!has_value && *n == 0xff))
+ break; /* No further sibling to try. NOTE(review): for nodes
+ without a value a following 0xff byte is presumably the end of
+ siblings marker of the tree format - confirm in makeuname2c.cc. */
+ }
+ while (1);
+ return -1;
+}
+
+/* Try to do a loose name lookup according to Unicode loose matching rule
+ UAX44-LM2. First ignore medial hyphens, whitespace, underscore
+ characters and convert to upper case.
+ NAME points to LEN bytes of the name as spelled by the user. Returns
+ the codepoint, or (cppchar_t) -1 if there is no loose match (including
+ when the stripped name is longer than uname2c_max_name_len). On
+ success the canonical name is stored, NUL terminated, in CANON_NAME.
+ NOTE(review): CANON_NAME is assumed to have room for
+ uname2c_max_name_len + 1 bytes, which is what the caller in
+ _cpp_valid_ucn passes - confirm for any other caller. */
+
+static cppchar_t
+_cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
+{
+ char name_after_uax44_lm2[uname2c_max_name_len];
+ char *q = name_after_uax44_lm2;
+ const char *p;
+
+ for (p = name; p < name + len; p++)
+ if (*p == '_' || *p == ' ')
+ continue;
+ else if (*p == '-' && p != name && ISALNUM (p[-1]) && ISALNUM (p[1]))
+ continue; /* NOTE(review): for a '-' in the last position p[1] is
+ NAME[LEN]; the caller in _cpp_valid_ucn only gets here when that
+ byte is the closing '}' - confirm for any other caller. */
+ else if (q == name_after_uax44_lm2 + uname2c_max_name_len)
+ return -1;
+ else if (ISLOWER (*p))
+ *q++ = TOUPPER (*p);
+ else
+ *q++ = *p;
+
+ struct uname2c_data data;
+ data.canon_name = canon_name;
+ data.prev_char = ' ';
+ /* Hangul Jungseong O- E after UAX44-LM2 should be HANGULJUNGSEONGO-E
+ and so should match U+1180. The remaining hyphen is dropped here
+ so that the lookup below is done on HANGULJUNGSEONGOE; whether
+ U+1180 or U+116C was meant is decided after the lookup. */
+ if (q - name_after_uax44_lm2 == sizeof ("HANGULJUNGSEONGO-E") - 1
+ && memcmp (name_after_uax44_lm2, "HANGULJUNGSEONGO-E",
+ sizeof ("HANGULJUNGSEONGO-E") - 1) == 0)
+ {
+ name_after_uax44_lm2[sizeof ("HANGULJUNGSEONGO") - 1] = 'E';
+ --q;
+ }
+ cppchar_t result
+ = _cpp_uname2c (name_after_uax44_lm2, q - name_after_uax44_lm2,
+ uname2c_tree, &data);
+
+ /* Unicode UAX44-LM2 exception:
+ U+116C HANGUL JUNGSEONG OE
+ U+1180 HANGUL JUNGSEONG O-E
+ We remove all medial hyphens when we shouldn't remove the U+1180 one.
+ The U+1180 entry sorts before U+116C lexicographically, so we get U+1180
+ in both cases. Thus, if result is U+1180, check if user's name doesn't
+ have a hyphen there and adjust.
+ P is NAME + LEN at this point (the loop above ran to completion), so
+ the code below scans the user's spelling backwards: skip trailing
+ spaces and underscores, step over the final E, skip spaces and
+ underscores again and look whether a hyphen precedes. */
+ if (result == 0x1180)
+ {
+ while (p[-1] == ' ' || p[-1] == '_')
+ --p;
+ gcc_assert (TOUPPER (p[-1]) == 'E');
+ --p;
+ while (p[-1] == ' ' || p[-1] == '_')
+ --p;
+ if (p[-1] != '-')
+ {
+ result = 0x116c;
+ memcpy (canon_name + sizeof ("HANGUL JUNGSEONG O") - 1, "E", 2);
+ }
+ }
+ return result;
+}
+
+
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
identifier. We assume C has already gone through the checks of
@@ -1094,7 +1430,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
unsigned int length;
const uchar *str = *pstr;
const uchar *base = str - 2;
- bool delimited = false;
+ bool delimited = false, named = false;
if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
cpp_error (pfile, CPP_DL_WARNING,
@@ -1108,6 +1444,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
"the meaning of '\\%c' is different in traditional C",
(int) str[-1]);
+ result = 0;
if (str[-1] == 'u')
{
length = 4;
@@ -1122,44 +1459,130 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
}
else if (str[-1] == 'U')
length = 8;
- else
+ else if (str[-1] == 'N')
{
- cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
length = 4;
- }
-
- result = 0;
- do
- {
- if (str == limit)
- break;
- c = *str;
- if (!ISXDIGIT (c))
- break;
- str++;
- extend_char_range (char_range, loc_reader);
- if (delimited)
+ if (str == limit || *str != '{')
+ cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'");
+ else
{
- if (!result)
- /* Accept arbitrary number of leading zeros.
- 16 is another magic value, smaller than 32 above
- and bigger than 8, so that upon encountering first
- non-zero digit we can count 8 digits and after that
- or in overflow bit and ensure length doesn't decrease
- to 0, as delimited escape sequence doesn't have upper
- bound on the number of hex digits. */
- length = 16;
- else if (length == 16 - 8)
+ str++;
+ named = true;
+ extend_char_range (char_range, loc_reader);
+ length = 0;
+ const uchar *name = str;
+ bool strict = true;
+
+ do
{
- /* Make sure we detect overflows. */
- result |= 0x8000000;
- ++length;
+ if (str == limit)
+ break;
+ c = *str;
+ if (!ISIDNUM (c) && c != ' ' && c != '-')
+ break;
+ if (ISLOWER (c) || c == '_')
+ strict = false;
+ str++;
+ extend_char_range (char_range, loc_reader);
}
- }
+ while (1);
- result = (result << 4) + hex_value (c);
+ if (str < limit && *str == '}')
+ {
+ if (name == str && identifier_pos)
+ {
+ *cp = 0;
+ return false;
+ }
+ if (name == str)
+ cpp_error (pfile, CPP_DL_ERROR,
+ "empty named universal character escape sequence");
+ else if (!CPP_OPTION (pfile, delimited_escape_seqs)
+ && CPP_OPTION (pfile, cpp_pedantic))
+ cpp_error (pfile, CPP_DL_PEDWARN,
+ "named universal character escapes are only valid "
+ "in C++23");
+ if (name == str)
+ result = 0x40;
+ else
+ {
+ /* If the name is longer than maximum length of a Unicode
+ name, it can't be strictly valid. */
+ if ((size_t) (str - name) > uname2c_max_name_len || !strict)
+ result = -1;
+ else
+ result = _cpp_uname2c ((const char *) name, str - name,
+ uname2c_tree, NULL);
+ if (result == (cppchar_t) -1)
+ {
+ cpp_error (pfile, CPP_DL_ERROR,
+ "\\N{%.*s} is not a valid universal "
+ "character", (int) (str - name), name);
+
+ /* Try to do a loose name lookup according to
+ Unicode loose matching rule UAX44-LM2. */
+ char canon_name[uname2c_max_name_len + 1];
+ result = _cpp_uname2c_uax44_lm2 ((const char *) name,
+ str - name, canon_name);
+ if (result != (cppchar_t) -1)
+ cpp_error (pfile, CPP_DL_NOTE,
+ "did you mean \\N{%s}?", canon_name);
+ else
+ result = 0x40;
+ }
+ }
+ str++;
+ extend_char_range (char_range, loc_reader);
+ }
+ else if (identifier_pos)
+ length = 1;
+ else
+ {
+ cpp_error (pfile, CPP_DL_ERROR,
+ "'\\N{' not terminated with '}' after %.*s",
+ (int) (str - base), base);
+ result = 1;
+ }
+ }
}
- while (--length);
+ else
+ {
+ cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
+ length = 4;
+ }
+
+ if (!named)
+ do
+ {
+ if (str == limit)
+ break;
+ c = *str;
+ if (!ISXDIGIT (c))
+ break;
+ str++;
+ extend_char_range (char_range, loc_reader);
+ if (delimited)
+ {
+ if (!result)
+ /* Accept arbitrary number of leading zeros.
+ 16 is another magic value, smaller than 32 above
+ and bigger than 8, so that upon encountering first
+ non-zero digit we can count 8 digits and after that
+ or in overflow bit and ensure length doesn't decrease
+ to 0, as delimited escape sequence doesn't have upper
+ bound on the number of hex digits. */
+ length = 16;
+ else if (length == 16 - 8)
+ {
+ /* Make sure we detect overflows. */
+ result |= 0x8000000;
+ ++length;
+ }
+ }
+
+ result = (result << 4) + hex_value (c);
+ }
+ while (--length);
if (delimited
&& str < limit
@@ -1274,7 +1697,7 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
/* loc_reader and ranges must either be both NULL, or both be non-NULL. */
gcc_assert ((loc_reader != NULL) == (ranges != NULL));
- from++; /* Skip u/U. */
+ from++; /* Skip u/U/N. */
/* The u/U is part of the spelling of this character. */
extend_char_range (&char_range, loc_reader);
@@ -1665,7 +2088,7 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
switch (c)
{
/* UCNs, hex escapes, and octal escapes are processed separately. */
- case 'u': case 'U':
+ case 'u': case 'U': case 'N':
return convert_ucn (pfile, from, limit, tbuf, cvt,
char_range, loc_reader, ranges);
@@ -2256,31 +2679,47 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
*bufp++ = id[idp];
else
{
- unsigned length = id[idp+1] == 'u' ? 4 : 8;
+ unsigned length = id[idp + 1] == 'u' ? 4 : 8;
cppchar_t value = 0;
size_t bufleft = len - (bufp - buf);
int rval;
bool delimited = false;
idp += 2;
- if (length == 4 && id[idp] == '{')
+ if (id[idp - 1] == 'N' && id[idp] == '{')
{
- delimited = true;
idp++;
+ const uchar *name = &id[idp];
+ while (idp < len
+ && (ISIDNUM (id[idp]) || id[idp] == ' ' || id[idp] == '-'))
+ idp++;
+ if (id[idp] == '}')
+ {
+ value = _cpp_uname2c ((const char *) name, &id[idp] - name,
+ uname2c_tree, NULL);
+ if (value == (cppchar_t) -1)
+ value = 1;
+ }
+ else
+ idp--;
}
- while (length && idp < len && ISXDIGIT (id[idp]))
+ else
{
- value = (value << 4) + hex_value (id[idp]);
- idp++;
- if (!delimited)
- length--;
+ if (length == 4 && id[idp] == '{')
+ {
+ delimited = true;
+ idp++;
+ }
+ while (length && idp < len && ISXDIGIT (id[idp]))
+ {
+ value = (value << 4) + hex_value (id[idp]);
+ idp++;
+ if (!delimited)
+ length--;
+ }
+ if (!delimited || id[idp] != '}')
+ idp--;
}
- if (!delimited)
- idp--;
- /* else
- assert (id[idp] == '}');
- As the caller ensures it is a valid identifier, if it is
- delimited escape sequence, it must be terminated by }. */
/* Special case for EBCDIC: if the identifier contains
a '$' specified using a UCN, translate it to EBCDIC. */