diff options
author | Jakub Jelinek <jakub@redhat.com> | 2022-09-07 08:44:38 +0200 |
---|---|---|
committer | Jakub Jelinek <jakub@redhat.com> | 2022-09-07 08:44:38 +0200 |
commit | 572f5e1bc68e131b25cd2d5ba231e932f5038904 (patch) | |
tree | 3bb768b7f06160f88ca0aaa8f0e2f5df0408ab7e /libcpp | |
parent | ea6e89e07f4223c8ac7877508c62bba368084999 (diff) | |
download | gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.zip gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.tar.gz gcc-572f5e1bc68e131b25cd2d5ba231e932f5038904.tar.bz2 |
libcpp: Named universal character escapes and delimited escape sequence tweaks
On Tue, Aug 30, 2022 at 09:10:37PM +0000, Joseph Myers wrote:
> I'm seeing build failures of glibc for powerpc64, as illustrated by the
> following C code:
>
> #if 0
> \NARG
> #endif
>
> (the actual sysdeps/powerpc/powerpc64/sysdep.h code is inside #ifdef
> __ASSEMBLER__).
>
> This shows some problems with this feature - and with delimited escape
> sequences - as it affects C. It's fine to accept it as an extension
> inside string and character literals, because \N or \u{...} would be
> invalid in the absence of the feature (i.e. the syntax for such literals
> fails to match, meaning that the rule about undefined behavior for a
> single ' or " as a pp-token applies). But outside string and character
> literals, the usual lexing rules apply, the \ is a pp-token on its own and
> the code is valid at the preprocessing level, and with expansion of macros
> appearing before or after the \ (e.g. u defined as a macro in the \u{...}
> case) it may be valid code at the language level as well. I don't know
> what older C++ versions say about this, but for C this means e.g.
>
> #define z(x) 0
> #define a z(
> int x = a\NARG);
>
> needs to be accepted as expanding to "int x = 0;", not interpreted as
> using the \N feature in an identifier and produce an error.
The following patch changes this, so that:
1) outside of string/character literals, \N without following { is never
treated as an error nor warning, it is silently treated as \ separate
token followed by whatever is after it
2) \u{123} and \N{LATIN SMALL LETTER A WITH ACUTE} are not handled as
extension at all outside of string/character literals in the strict
standard modes (-std=c*) except for -std=c++{23,2b}, only in the
-std=gnu* modes, because it changes behavior on valid sources, e.g.
#define z(x) 0
#define a z(
int x = a\u{123});
int y = a\N{LATIN SMALL LETTER A WITH ACUTE});
3) introduces -Wunicode warning (on by default) and warns for cases
of what looks like invalid delimited escape sequence or named
universal character escape outside of string/character literals
and is treated as separate tokens
2022-09-07 Jakub Jelinek <jakub@redhat.com>
libcpp/
* include/cpplib.h (struct cpp_options): Add cpp_warn_unicode member.
(enum cpp_warning_reason): Add CPP_W_UNICODE.
* init.cc (cpp_create_reader): Initialize cpp_warn_unicode.
* charset.cc (_cpp_valid_ucn): In possible identifier contexts, don't
handle \u{ or \N{ specially in -std=c* modes except -std=c++2{3,b}.
In possible identifier contexts, don't emit an error and punt
if \N isn't followed by {, or if \N{} surrounds some lower case
letters or _. In possible identifier contexts when not C++23, don't
emit an error but warning about unknown character names and treat as
separate tokens. When treating as separate tokens \u{ or \N{, emit
warnings.
gcc/
* doc/invoke.texi (-Wno-unicode): Document.
gcc/c-family/
* c.opt (Winvalid-utf8): Use ObjC instead of objC. Remove
" in comments" from description.
(Wunicode): New option.
gcc/testsuite/
* c-c++-common/cpp/delimited-escape-seq-4.c: New test.
* c-c++-common/cpp/delimited-escape-seq-5.c: New test.
* c-c++-common/cpp/delimited-escape-seq-6.c: New test.
* c-c++-common/cpp/delimited-escape-seq-7.c: New test.
* c-c++-common/cpp/named-universal-char-escape-5.c: New test.
* c-c++-common/cpp/named-universal-char-escape-6.c: New test.
* c-c++-common/cpp/named-universal-char-escape-7.c: New test.
* g++.dg/cpp23/named-universal-char-escape1.C: New test.
* g++.dg/cpp23/named-universal-char-escape2.C: New test.
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/charset.cc | 86 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 7 | ||||
-rw-r--r-- | libcpp/init.cc | 1 |
3 files changed, 78 insertions, 16 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc index c9656db..6834969 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1448,7 +1448,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str[-1] == 'u') { length = 4; - if (str < limit && *str == '{') + if (str < limit + && *str == '{' + && (!identifier_pos + || CPP_OPTION (pfile, delimited_escape_seqs) + || !CPP_OPTION (pfile, std))) { str++; /* Magic value to indicate no digits seen. */ @@ -1462,8 +1466,22 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, else if (str[-1] == 'N') { length = 4; + if (identifier_pos + && !CPP_OPTION (pfile, delimited_escape_seqs) + && CPP_OPTION (pfile, std)) + { + *cp = 0; + return false; + } if (str == limit || *str != '{') - cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + { + if (identifier_pos) + { + *cp = 0; + return false; + } + cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + } else { str++; @@ -1489,15 +1507,19 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str < limit && *str == '}') { - if (name == str && identifier_pos) + if (identifier_pos && name == str) { + cpp_warning (pfile, CPP_W_UNICODE, + "empty named universal character escape " + "sequence; treating it as separate tokens"); *cp = 0; return false; } if (name == str) cpp_error (pfile, CPP_DL_ERROR, "empty named universal character escape sequence"); - else if (!CPP_OPTION (pfile, delimited_escape_seqs) + else if ((!identifier_pos || strict) + && !CPP_OPTION (pfile, delimited_escape_seqs) && CPP_OPTION (pfile, cpp_pedantic)) cpp_error (pfile, CPP_DL_PEDWARN, "named universal character escapes are only valid " @@ -1515,27 +1537,51 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, uname2c_tree, NULL); if (result == (cppchar_t) -1) { - cpp_error (pfile, CPP_DL_ERROR, - "\\N{%.*s} is not a valid universal " - "character", (int) (str - name), name); + bool ret = true; + if (identifier_pos + && (!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + ret = cpp_warning (pfile, CPP_W_UNICODE, + "\\N{%.*s} is not a valid " + "universal character; treating it " + "as separate tokens", + (int) (str - name), name); + else + cpp_error (pfile, CPP_DL_ERROR, + "\\N{%.*s} is not a valid universal " + "character", (int) (str - name), name); /* Try to do a loose name lookup according to Unicode loose matching rule UAX44-LM2. */ char canon_name[uname2c_max_name_len + 1]; result = _cpp_uname2c_uax44_lm2 ((const char *) name, str - name, canon_name); - if (result != (cppchar_t) -1) + if (result != (cppchar_t) -1 && ret) cpp_error (pfile, CPP_DL_NOTE, "did you mean \\N{%s}?", canon_name); else - result = 0x40; + result = 0xC0; + if (identifier_pos + && (!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + { + *cp = 0; + return false; + } } } str++; extend_char_range (char_range, loc_reader); } else if (identifier_pos) - length = 1; + { + cpp_warning (pfile, CPP_W_UNICODE, + "'\\N{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); + *cp = 0; + return false; + } else { cpp_error (pfile, CPP_DL_ERROR, @@ -1584,12 +1630,17 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, } while (--length); - if (delimited - && str < limit - && *str == '}' - && (length != 32 || !identifier_pos)) + if (delimited && str < limit && *str == '}') { - if (length == 32) + if (length == 32 && identifier_pos) + { + cpp_warning (pfile, CPP_W_UNICODE, + "empty delimited escape sequence; " + "treating it as separate tokens"); + *cp = 0; + return false; + } + else if (length == 32) cpp_error (pfile, CPP_DL_ERROR, "empty delimited escape sequence"); else if (!CPP_OPTION (pfile, delimited_escape_seqs) @@ -1607,6 +1658,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, error message in that case. */ if (length && identifier_pos) { + if (delimited) + cpp_warning (pfile, CPP_W_UNICODE, + "'\\u{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); *cp = 0; return false; } diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 1a3fb19..c25bcf2 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -565,6 +565,10 @@ struct cpp_options 2 if it should be a pedwarn. */ unsigned char cpp_warn_invalid_utf8; + /* True if libcpp should warn about invalid forms of delimited or named + escape sequences. */ + bool cpp_warn_unicode; + /* True if -finput-charset= option has been used explicitly. */ bool cpp_input_charset_explicit; @@ -675,7 +679,8 @@ enum cpp_warning_reason { CPP_W_CXX20_COMPAT, CPP_W_EXPANSION_TO_DEFINED, CPP_W_BIDIRECTIONAL, - CPP_W_INVALID_UTF8 + CPP_W_INVALID_UTF8, + CPP_W_UNICODE }; /* Callback for header lookup for HEADER, which is the name of a diff --git a/libcpp/init.cc b/libcpp/init.cc index 3e5601a..6292524 100644 --- a/libcpp/init.cc +++ b/libcpp/init.cc @@ -228,6 +228,7 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table, CPP_OPTION (pfile, warn_date_time) = 0; CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired; CPP_OPTION (pfile, cpp_warn_invalid_utf8) = 0; + CPP_OPTION (pfile, cpp_warn_unicode) = 1; CPP_OPTION (pfile, cpp_input_charset_explicit) = 0; /* Default CPP arithmetic to something sensible for the host for the |