From 4805b92a32637b987f924463d6af9dcf95b21f63 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 5 Aug 2021 17:34:16 +0200 Subject: libcpp: Fix makeucnid bug with combining values [PR100977] I've noticed in ucnid.h two adjacent lines that had all flags and combine values identical and as such were supposed to be merged. This is due to a bug in makeucnid.c, which records last_flag, last_combine and really_safe of what has just been printed, but because of a typo it mishandles last_combine, so the comparison is always against combining_value[0], which is 0. This has two effects on the table. One is that often the table is unnecessarily large, as for non-zero .combine every character has its own record instead of adjacent characters with the same flags and combine being merged. This means larger tables. The other is that sometimes the last char that has combine set doesn't actually have it in the tables, because the code is printing entries only upon seeing the next character and if that character does have combining_value of 0 and flags are otherwise the same as previously printed, it will not print anything. The following patch fixes that; to make clear exactly what it affects, I've regenerated ucnid.h with the same Unicode files that were used the last time it was regenerated. 2021-08-05 Jakub Jelinek PR c++/100977 * makeucnid.c (write_table): Fix computation of last_combine. * ucnid.h: Regenerated using Unicode 6.3.0 files. 
--- libcpp/makeucnid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'libcpp/makeucnid.c') diff --git a/libcpp/makeucnid.c b/libcpp/makeucnid.c index 66d251d..d2c8d58 100644 --- a/libcpp/makeucnid.c +++ b/libcpp/makeucnid.c @@ -274,7 +274,7 @@ write_table (void) combining_value[i - 1], i - 1); last_flag = flags[i]; - last_combine = combining_value[0]; + last_combine = combining_value[i]; really_safe = decomp[i][0] == 0; } -- cgit v1.1 From c4d6dcacfca1b804504515496e6d9de176d7f51e Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 1 Sep 2021 22:33:06 +0200 Subject: libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 The following patch implements the P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 paper. We already allow UTF-8 characters in the source, so that part is already implemented, so IMHO all we need to do is pedwarn instead of just warn for the (default) -Wnormalize=nfc (or for -Wnormalize={id,nfkc}) if the character is not in NFC and to use the Unicode XID_Start and XID_Continue derived core properties to find out what characters are allowed (the standard actually adds U+005F to XID_Start, but we are handling the ASCII compatible characters differently already and they aren't allowed in UCNs in identifiers). Instead of hardcoding the large tables in ucnid.tab, this patch makes makeucnid.c read them from the Unicode tables (13.0.0 version at this point). For non-pedantic mode, we accept as 2nd+ char in identifiers a union of valid characters in all supported modes, but for the 1st char it was actually pedantically requiring that it is not any of the characters that may not appear in the currently chosen standard as the first character. This patch changes it such that also what is allowed at the start of an identifier is a union of characters valid at the start of an identifier in any of the pedantic modes. 
2021-09-01 Jakub Jelinek PR c++/100977 libcpp/ * include/cpplib.h (struct cpp_options): Add cxx23_identifiers. * charset.c (CXX23, NXX23): New enumerators. (CID, NFC, NKC, CTX): Renumber. (ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and NXX23 flags for cxx23_identifiers. For start character in non-pedantic mode, allow characters that are allowed as start characters in any of the supported language modes, rather than disallowing characters allowed only as non-start characters in current mode but for characters from other language modes allowing them even if they are never allowed at start. * init.c (struct lang_flags): Add cxx23_identifiers. (lang_defaults): Add cxx23_identifiers column. (cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers). * lex.c (warn_about_normalization): If cxx23_identifiers, use cpp_pedwarning_with_line instead of cpp_warning_with_line for "is not in NFC" diagnostics. * makeucnid.c: Adjust usage comment. (CXX23, NXX23): New enumerators. (all_languages): Add CXX23. (not_NFC, not_NFKC, maybe_not_NFC): Renumber. (read_derivedcore): New function. (write_table): Print also CXX23 and NXX23 columns. (main): Require 5 arguments instead of 4, call read_derivedcore. * ucnid.h: Regenerated using Unicode 13.0.0 files. gcc/testsuite/ * g++.dg/cpp23/normalize1.C: New test. * g++.dg/cpp23/normalize2.C: New test. * g++.dg/cpp23/normalize3.C: New test. * g++.dg/cpp23/normalize4.C: New test. * g++.dg/cpp23/normalize5.C: New test. * g++.dg/cpp23/normalize6.C: New test. * g++.dg/cpp23/normalize7.C: New test. * g++.dg/cpp23/ucnid-1-utf8.C: New test. * g++.dg/cpp23/ucnid-2-utf8.C: New test. * gcc.dg/cpp/ucnid-4.c: Don't expect "not valid at the start of an identifier" errors. * gcc.dg/cpp/ucnid-4-utf8.c: Likewise. * gcc.dg/cpp/ucnid-5-utf8.c: New test. 
--- libcpp/makeucnid.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 7 deletions(-) (limited to 'libcpp/makeucnid.c') diff --git a/libcpp/makeucnid.c b/libcpp/makeucnid.c index d2c8d58..b3a0aab 100644 --- a/libcpp/makeucnid.c +++ b/libcpp/makeucnid.c @@ -17,7 +17,7 @@ along with this program; see the file COPYING3. If not see /* Run this program as ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \ - > ucnid.h + DerivedCoreProperties.txt > ucnid.h */ #include @@ -32,10 +32,12 @@ enum { N99 = 4, C11 = 8, N11 = 16, - all_languages = C99 | CXX | C11, - not_NFC = 32, - not_NFKC = 64, - maybe_not_NFC = 128 + CXX23 = 32, + NXX23 = 64, + all_languages = C99 | CXX | C11 | CXX23 | NXX23, + not_NFC = 128, + not_NFKC = 256, + maybe_not_NFC = 512 }; #define NUM_CODE_POINTS 0x110000 @@ -241,6 +243,74 @@ read_derived (const char *fname) fclose (f); } +/* Read DerivedCoreProperties.txt and fill in languages version in + flags from the XID_Start and XID_Continue properties. */ + +static void +read_derivedcore (char *fname) +{ + FILE * f = fopen (fname, "r"); + + if (!f) + fail ("opening DerivedCoreProperties.txt"); + for (;;) + { + char line[256]; + unsigned long codepoint_start, codepoint_end; + char *l; + int i, j; + + if (!fgets (line, sizeof (line), f)) + break; + if (line[0] == '#' || line[0] == '\n' || line[0] == '\r') + continue; + codepoint_start = strtoul (line, &l, 16); + if (l == line) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_start > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"); + + if (*l == '.' 
&& l[1] == '.') + { + char *l2 = l + 2; + codepoint_end = strtoul (l + 2, &l, 16); + if (l == l2 || codepoint_end < codepoint_start) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_end > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"); + } + else + codepoint_end = codepoint_start; + + while (*l == ' ') + l++; + if (*l++ != ';') + fail ("parsing DerivedCoreProperties.txt, reading code point"); + + while (*l == ' ') + l++; + + if (codepoint_end < 0x80) + continue; + + if (strncmp (l, "XID_Start ", 10) == 0) + { + for (; codepoint_start <= codepoint_end; codepoint_start++) + flags[codepoint_start] + = (flags[codepoint_start] | CXX23) & ~NXX23; + } + else if (strncmp (l, "XID_Continue ", 13) == 0) + { + for (; codepoint_start <= codepoint_end; codepoint_start++) + if ((flags[codepoint_start] & CXX23) == 0) + flags[codepoint_start] |= CXX23 | NXX23; + } + } + if (ferror (f)) + fail ("reading DerivedCoreProperties.txt"); + fclose (f); +} + /* Write out the table. The table consists of two words per entry. The first word is the flags for the unicode code points up to and including the second word. */ @@ -261,12 +331,14 @@ write_table (void) || really_safe != (decomp[i][0] == 0) || combining_value[i] != last_combine) { - printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", + printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", last_flag & C99 ? "C99" : " 0", last_flag & N99 ? "N99" : " 0", last_flag & CXX ? "CXX" : " 0", last_flag & C11 ? "C11" : " 0", last_flag & N11 ? "N11" : " 0", + last_flag & CXX23 ? "CXX23" : " 0", + last_flag & NXX23 ? "NXX23" : " 0", really_safe ? "CID" : " 0", last_flag & not_NFC ? " 0" : "NFC", last_flag & not_NFKC ? 
" 0" : "NKC", @@ -439,11 +511,12 @@ write_copyright (void) int main(int argc, char ** argv) { - if (argc != 4) + if (argc != 5) fail ("too few arguments to makeucn"); read_ucnid (argv[1]); read_table (argv[2]); read_derived (argv[3]); + read_derivedcore (argv[4]); write_copyright (); write_table (); -- cgit v1.1