diff options
author | Jakub Jelinek <jakub@redhat.com> | 2022-09-07 08:44:38 +0200 |
---|---|---|
committer | Jakub Jelinek <jakub@redhat.com> | 2022-09-07 08:44:38 +0200 |
commit | 572f5e1bc68e131b25cd2d5ba231e932f5038904 (patch) | |
tree | 3bb768b7f06160f88ca0aaa8f0e2f5df0408ab7e | |
parent | ea6e89e07f4223c8ac7877508c62bba368084999 (diff) | |
download | gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.zip gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.tar.gz gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.tar.bz2 |
libcpp: Named universal character escapes and delimited escape sequence tweaks
On Tue, Aug 30, 2022 at 09:10:37PM +0000, Joseph Myers wrote:
> I'm seeing build failures of glibc for powerpc64, as illustrated by the
> following C code:
>
> #if 0
> \NARG
> #endif
>
> (the actual sysdeps/powerpc/powerpc64/sysdep.h code is inside #ifdef
> __ASSEMBLER__).
>
> This shows some problems with this feature - and with delimited escape
> sequences - as it affects C. It's fine to accept it as an extension
> inside string and character literals, because \N or \u{...} would be
> invalid in the absence of the feature (i.e. the syntax for such literals
> fails to match, meaning that the rule about undefined behavior for a
> single ' or " as a pp-token applies). But outside string and character
> literals, the usual lexing rules apply, the \ is a pp-token on its own and
> the code is valid at the preprocessing level, and with expansion of macros
> appearing before or after the \ (e.g. u defined as a macro in the \u{...}
> case) it may be valid code at the language level as well. I don't know
> what older C++ versions say about this, but for C this means e.g.
>
> #define z(x) 0
> #define a z(
> int x = a\NARG);
>
> needs to be accepted as expanding to "int x = 0;", not interpreted as
> using the \N feature in an identifier and produce an error.
The following patch changes this, so that:
1) outside of string/character literals, \N without following { is never
treated as an error nor warning, it is silently treated as \ separate
token followed by whatever is after it
2) \u{123} and \N{LATIN SMALL LETTER A WITH ACUTE} are not handled as
extension at all outside of string/character literals in the strict
standard modes (-std=c*) except for -std=c++{23,2b}, only in the
-std=gnu* modes, because it changes behavior on valid sources, e.g.
#define z(x) 0
#define a z(
int x = a\u{123});
int y = a\N{LATIN SMALL LETTER A WITH ACUTE});
3) introduces -Wunicode warning (on by default) and warns for cases
of what looks like invalid delimited escape sequence or named
universal character escape outside of string/character literals
and is treated as separate tokens
2022-09-07 Jakub Jelinek <jakub@redhat.com>
libcpp/
* include/cpplib.h (struct cpp_options): Add cpp_warn_unicode member.
(enum cpp_warning_reason): Add CPP_W_UNICODE.
* init.cc (cpp_create_reader): Initialize cpp_warn_unicode.
* charset.cc (_cpp_valid_ucn): In possible identifier contexts, don't
handle \u{ or \N{ specially in -std=c* modes except -std=c++2{3,b}.
In possible identifier contexts, don't emit an error and punt
if \N isn't followed by {, or if \N{} surrounds some lower case
letters or _. In possible identifier contexts when not C++23, don't
emit an error but warning about unknown character names and treat as
separate tokens. When treating as separate tokens \u{ or \N{, emit
warnings.
gcc/
* doc/invoke.texi (-Wno-unicode): Document.
gcc/c-family/
* c.opt (Winvalid-utf8): Use ObjC instead of objC. Remove
" in comments" from description.
(Wunicode): New option.
gcc/testsuite/
* c-c++-common/cpp/delimited-escape-seq-4.c: New test.
* c-c++-common/cpp/delimited-escape-seq-5.c: New test.
* c-c++-common/cpp/delimited-escape-seq-6.c: New test.
* c-c++-common/cpp/delimited-escape-seq-7.c: New test.
* c-c++-common/cpp/named-universal-char-escape-5.c: New test.
* c-c++-common/cpp/named-universal-char-escape-6.c: New test.
* c-c++-common/cpp/named-universal-char-escape-7.c: New test.
* g++.dg/cpp23/named-universal-char-escape1.C: New test.
* g++.dg/cpp23/named-universal-char-escape2.C: New test.
-rw-r--r-- | gcc/c-family/c.opt | 8 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 8 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c | 13 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c | 13 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c | 13 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c | 13 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c | 17 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c | 17 | ||||
-rw-r--r-- | gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c | 17 | ||||
-rw-r--r-- | gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C | 16 | ||||
-rw-r--r-- | gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C | 18 | ||||
-rw-r--r-- | libcpp/charset.cc | 86 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 7 | ||||
-rw-r--r-- | libcpp/init.cc | 1 |
14 files changed, 228 insertions, 19 deletions
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt index 4515664..1c7f89e 100644 --- a/gcc/c-family/c.opt +++ b/gcc/c-family/c.opt @@ -822,8 +822,8 @@ C ObjC C++ ObjC++ CPP(warn_invalid_pch) CppReason(CPP_W_INVALID_PCH) Var(cpp_war Warn about PCH files that are found but not used. Winvalid-utf8 -C objC C++ ObjC++ CPP(cpp_warn_invalid_utf8) CppReason(CPP_W_INVALID_UTF8) Var(warn_invalid_utf8) Init(0) Warning -Warn about invalid UTF-8 characters in comments. +C ObjC C++ ObjC++ CPP(cpp_warn_invalid_utf8) CppReason(CPP_W_INVALID_UTF8) Var(warn_invalid_utf8) Init(0) Warning +Warn about invalid UTF-8 characters. Wjump-misses-init C ObjC Var(warn_jump_misses_init) Warning LangEnabledby(C ObjC,Wc++-compat) @@ -1345,6 +1345,10 @@ Wundef C ObjC C++ ObjC++ CPP(warn_undef) CppReason(CPP_W_UNDEF) Var(cpp_warn_undef) Init(0) Warning Warn if an undefined macro is used in an #if directive. +Wunicode +C ObjC C++ ObjC++ CPP(cpp_warn_unicode) CppReason(CPP_W_UNICODE) Var(warn_unicode) Init(1) Warning +Warn about invalid forms of delimited or named escape sequences. + Wuninitialized C ObjC C++ ObjC++ LTO LangEnabledBy(C ObjC C++ ObjC++ LTO,Wall) ; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9d662e3..cc631df 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -365,7 +365,7 @@ Objective-C and Objective-C++ Dialects}. -Winfinite-recursion @gol -Winit-self -Winline -Wno-int-conversion -Wint-in-bool-context @gol -Wno-int-to-pointer-cast -Wno-invalid-memory-model @gol --Winvalid-pch -Winvalid-utf8 -Wjump-misses-init @gol +-Winvalid-pch -Winvalid-utf8 -Wno-unicode -Wjump-misses-init @gol -Wlarger-than=@var{byte-size} -Wlogical-not-parentheses -Wlogical-op @gol -Wlong-long -Wno-lto-type-mismatch -Wmain -Wmaybe-uninitialized @gol -Wmemset-elt-size -Wmemset-transposed-args @gol @@ -9578,6 +9578,12 @@ Warn if an invalid UTF-8 character is found. This warning is on by default for C++23 if @option{-finput-charset=UTF-8} is used and turned into error with @option{-pedantic-errors}. +@item -Wno-unicode +@opindex Wunicode +@opindex Wno-unicode +Don't diagnose invalid forms of delimited or named escape sequences which are +treated as separate tokens. @option{Wunicode} is enabled by default. + @item -Wlong-long @opindex Wlong-long @opindex Wno-long-long diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c new file mode 100644 index 0000000..107051f --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=gnu++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-warning "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c new file mode 100644 index 0000000..e04f519 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-warning "empty delimited escape sequence; treating it as separate tokens" "" { target c++23 } } */ +int c = a\u{); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" "" { target c++23 } } */ +int d = a\u{12XYZ}); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" "" { target c++23 } } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c new file mode 100644 index 0000000..f2a4e93 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=gnu++20 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-bogus "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c new file mode 100644 index 0000000..e2f0da4 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=c++23 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-bogus "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c new file mode 100644 index 0000000..a1c53c7 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=gnu++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); /* { dg-warning "empty named universal character escape sequence; treating it as separate tokens" } */ +int c = a\N{); /* { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } */ +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); /* { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } */ +int g = a\N{ABC.123}); /* { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" } */ +int h = a\N{NON-EXISTENT CHAR}); /* { dg-warning "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); /* { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } */ + /* { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" "" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c new file mode 100644 index 0000000..a6a5a10 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); +int c = a\N{); +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); +int g = a\N{ABC.123}); +int h = a\N{NON-EXISTENT CHAR}); /* { dg-bogus "is not a valid universal character" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); +int j = a\N{LATIN SMALL LETTER A WITH ACUTE}); diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c new file mode 100644 index 0000000..e6142bf --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=gnu++20 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); /* { dg-bogus "empty named universal character escape sequence; treating it as separate tokens" } */ +int c = a\N{); /* { dg-bogus "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } */ +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); /* { dg-bogus "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } */ +int g = a\N{ABC.123}); /* { dg-bogus "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" } */ +int h = a\N{NON-EXISTENT CHAR}); /* { dg-bogus "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); /* { dg-bogus "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } */ + /* { dg-bogus "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" "" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C new file mode 100644 index 0000000..fe49482 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C @@ -0,0 +1,16 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile } +// { dg-require-effective-target wchar } + +#define z(x) 0 +#define a z( +int b = a\N{}); // { dg-warning "empty named universal character escape sequence; treating it as separate tokens" "" { target c++23 } } +int c = a\N{); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" "" { target c++23 } } +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); // { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" "" { target c++23 } } +int g = a\N{ABC.123}); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" "" { target c++23 } } +int h = a\N{NON-EXISTENT CHAR}); // { dg-error "is not a valid universal character" "" { target c++23 } } + // { dg-error "was not declared in this scope" "" { target c++23 } .-1 } +int i = a\N{Latin_Small_Letter_A_With_Acute}); // { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" "" { target c++23 } } + // { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" "" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C new file mode 100644 index 0000000..8699e09 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C @@ -0,0 +1,18 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile } +// { dg-require-effective-target wchar } +// { dg-options "" } + +#define z(x) 0 +#define a z( +int b = a\N{}); // { dg-warning "empty named universal character escape sequence; treating it as separate tokens" } +int c = a\N{); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); // { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } +int g = a\N{ABC.123}); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" } +int h = a\N{NON-EXISTENT CHAR}); // { dg-error "is not a valid universal character" "" { target c++23 } } + // { dg-error "was not declared in this scope" "" { target c++23 } .-1 } + // { dg-warning "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" "" { target c++20_down } .-2 } +int i = a\N{Latin_Small_Letter_A_With_Acute}); // { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } + // { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" "" { target *-*-* } .-1 } diff --git a/libcpp/charset.cc b/libcpp/charset.cc index c9656db..6834969 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1448,7 +1448,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str[-1] == 'u') { length = 4; - if (str < limit && *str == '{') + if (str < limit + && *str == '{' + && (!identifier_pos + || CPP_OPTION (pfile, delimited_escape_seqs) + || !CPP_OPTION (pfile, std))) { str++; /* Magic value to indicate no digits seen. */ @@ -1462,8 +1466,22 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, else if (str[-1] == 'N') { length = 4; + if (identifier_pos + && !CPP_OPTION (pfile, delimited_escape_seqs) + && CPP_OPTION (pfile, std)) + { + *cp = 0; + return false; + } if (str == limit || *str != '{') - cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + { + if (identifier_pos) + { + *cp = 0; + return false; + } + cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + } else { str++; @@ -1489,15 +1507,19 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str < limit && *str == '}') { - if (name == str && identifier_pos) + if (identifier_pos && name == str) { + cpp_warning (pfile, CPP_W_UNICODE, + "empty named universal character escape " + "sequence; treating it as separate tokens"); *cp = 0; return false; } if (name == str) cpp_error (pfile, CPP_DL_ERROR, "empty named universal character escape sequence"); - else if (!CPP_OPTION (pfile, delimited_escape_seqs) + else if ((!identifier_pos || strict) + && !CPP_OPTION (pfile, delimited_escape_seqs) && CPP_OPTION (pfile, cpp_pedantic)) cpp_error (pfile, CPP_DL_PEDWARN, "named universal character escapes are only valid " @@ -1515,27 +1537,51 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, uname2c_tree, NULL); if (result == (cppchar_t) -1) { - cpp_error (pfile, CPP_DL_ERROR, - "\\N{%.*s} is not a valid universal " - "character", (int) (str - name), name); + bool ret = true; + if (identifier_pos + && (!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + ret = cpp_warning (pfile, CPP_W_UNICODE, + "\\N{%.*s} is not a valid " + "universal character; treating it " + "as separate tokens", + (int) (str - name), name); + else + cpp_error (pfile, CPP_DL_ERROR, + "\\N{%.*s} is not a valid universal " + "character", (int) (str - name), name); /* Try to do a loose name lookup according to Unicode loose matching rule UAX44-LM2. */ char canon_name[uname2c_max_name_len + 1]; result = _cpp_uname2c_uax44_lm2 ((const char *) name, str - name, canon_name); - if (result != (cppchar_t) -1) + if (result != (cppchar_t) -1 && ret) cpp_error (pfile, CPP_DL_NOTE, "did you mean \\N{%s}?", canon_name); else - result = 0x40; + result = 0xC0; + if (identifier_pos + && (!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + { + *cp = 0; + return false; + } } } str++; extend_char_range (char_range, loc_reader); } else if (identifier_pos) - length = 1; + { + cpp_warning (pfile, CPP_W_UNICODE, + "'\\N{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); + *cp = 0; + return false; + } else { cpp_error (pfile, CPP_DL_ERROR, @@ -1584,12 +1630,17 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, } while (--length); - if (delimited - && str < limit - && *str == '}' - && (length != 32 || !identifier_pos)) + if (delimited && str < limit && *str == '}') { - if (length == 32) + if (length == 32 && identifier_pos) + { + cpp_warning (pfile, CPP_W_UNICODE, + "empty delimited escape sequence; " + "treating it as separate tokens"); + *cp = 0; + return false; + } + else if (length == 32) cpp_error (pfile, CPP_DL_ERROR, "empty delimited escape sequence"); else if (!CPP_OPTION (pfile, delimited_escape_seqs) @@ -1607,6 +1658,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, error message in that case. */ if (length && identifier_pos) { + if (delimited) + cpp_warning (pfile, CPP_W_UNICODE, + "'\\u{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); *cp = 0; return false; } diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 1a3fb19..c25bcf2 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -565,6 +565,10 @@ struct cpp_options 2 if it should be a pedwarn. */ unsigned char cpp_warn_invalid_utf8; + /* True if libcpp should warn about invalid forms of delimited or named + escape sequences. */ + bool cpp_warn_unicode; + /* True if -finput-charset= option has been used explicitly. */ bool cpp_input_charset_explicit; @@ -675,7 +679,8 @@ enum cpp_warning_reason { CPP_W_CXX20_COMPAT, CPP_W_EXPANSION_TO_DEFINED, CPP_W_BIDIRECTIONAL, - CPP_W_INVALID_UTF8 + CPP_W_INVALID_UTF8, + CPP_W_UNICODE }; /* Callback for header lookup for HEADER, which is the name of a diff --git a/libcpp/init.cc b/libcpp/init.cc index 3e5601a..6292524 100644 --- a/libcpp/init.cc +++ b/libcpp/init.cc @@ -228,6 +228,7 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table, CPP_OPTION (pfile, warn_date_time) = 0; CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired; CPP_OPTION (pfile, cpp_warn_invalid_utf8) = 0; + CPP_OPTION (pfile, cpp_warn_unicode) = 1; CPP_OPTION (pfile, cpp_input_charset_explicit) = 0; /* Default CPP arithmetic to something sensible for the host for the |