From 572f5e1bc68e131b25cd2d5ba231e932f5038904 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 7 Sep 2022 08:44:38 +0200 Subject: libcpp: Named universal character escapes and delimited escape sequence tweaks On Tue, Aug 30, 2022 at 09:10:37PM +0000, Joseph Myers wrote: > I'm seeing build failures of glibc for powerpc64, as illustrated by the > following C code: > > #if 0 > \NARG > #endif > > (the actual sysdeps/powerpc/powerpc64/sysdep.h code is inside #ifdef > __ASSEMBLER__). > > This shows some problems with this feature - and with delimited escape > sequences - as it affects C. It's fine to accept it as an extension > inside string and character literals, because \N or \u{...} would be > invalid in the absence of the feature (i.e. the syntax for such literals > fails to match, meaning that the rule about undefined behavior for a > single ' or " as a pp-token applies). But outside string and character > literals, the usual lexing rules apply, the \ is a pp-token on its own and > the code is valid at the preprocessing level, and with expansion of macros > appearing before or after the \ (e.g. u defined as a macro in the \u{...} > case) it may be valid code at the language level as well. I don't know > what older C++ versions say about this, but for C this means e.g. > > #define z(x) 0 > #define a z( > int x = a\NARG); > > needs to be accepted as expanding to "int x = 0;", not interpreted as > using the \N feature in an identifier and produce an error. The following patch changes this, so that: 1) outside of string/character literals, \N without a following { is never treated as an error or a warning; it is silently treated as a separate \ token followed by whatever comes after it 2) \u{123} and \N{LATIN SMALL LETTER A WITH ACUTE} are not handled as an extension at all outside of string/character literals in the strict standard modes (-std=c*) other than -std=c++{23,2b}; they are handled as an extension only in the -std=gnu* modes, because doing so changes behavior on valid sources, e.g.
#define z(x) 0 #define a z( int x = a\u{123}); int y = a\N{LATIN SMALL LETTER A WITH ACUTE}); 3) a new -Wunicode warning (on by default) is introduced; it warns about what looks like an invalid delimited escape sequence or named universal character escape outside of string/character literals, which is then treated as separate tokens 2022-09-07 Jakub Jelinek libcpp/ * include/cpplib.h (struct cpp_options): Add cpp_warn_unicode member. (enum cpp_warning_reason): Add CPP_W_UNICODE. * init.cc (cpp_create_reader): Initialize cpp_warn_unicode. * charset.cc (_cpp_valid_ucn): In possible identifier contexts, don't handle \u{ or \N{ specially in -std=c* modes except -std=c++2{3,b}. In possible identifier contexts, don't emit an error and punt if \N isn't followed by {, or if \N{} surrounds some lower case letters or _. In possible identifier contexts when not C++23, don't emit an error but a warning about unknown character names and treat as separate tokens. When treating as separate tokens \u{ or \N{, emit warnings. gcc/ * doc/invoke.texi (-Wno-unicode): Document. gcc/c-family/ * c.opt (Winvalid-utf8): Use ObjC instead of objC. Remove " in comments" from description. (Wunicode): New option. gcc/testsuite/ * c-c++-common/cpp/delimited-escape-seq-4.c: New test. * c-c++-common/cpp/delimited-escape-seq-5.c: New test. * c-c++-common/cpp/delimited-escape-seq-6.c: New test. * c-c++-common/cpp/delimited-escape-seq-7.c: New test. * c-c++-common/cpp/named-universal-char-escape-5.c: New test. * c-c++-common/cpp/named-universal-char-escape-6.c: New test. * c-c++-common/cpp/named-universal-char-escape-7.c: New test. * g++.dg/cpp23/named-universal-char-escape1.C: New test. * g++.dg/cpp23/named-universal-char-escape2.C: New test.
--- libcpp/include/cpplib.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'libcpp/include/cpplib.h') diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 1a3fb19..c25bcf2 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -565,6 +565,10 @@ struct cpp_options 2 if it should be a pedwarn. */ unsigned char cpp_warn_invalid_utf8; + /* True if libcpp should warn about invalid forms of delimited or named + escape sequences. */ + bool cpp_warn_unicode; + /* True if -finput-charset= option has been used explicitly. */ bool cpp_input_charset_explicit; @@ -675,7 +679,8 @@ enum cpp_warning_reason { CPP_W_CXX20_COMPAT, CPP_W_EXPANSION_TO_DEFINED, CPP_W_BIDIRECTIONAL, - CPP_W_INVALID_UTF8 + CPP_W_INVALID_UTF8, + CPP_W_UNICODE }; /* Callback for header lookup for HEADER, which is the name of a -- cgit v1.1 From 0a91bdaf177409a2a5e7895bce4f0e7091b4b3ca Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Wed, 7 Sep 2022 13:56:25 +0000 Subject: c: New C2x keywords C2x follows C++ in making alignas, alignof, bool, false, static_assert, thread_local and true keywords; implement this accordingly. This implementation makes them normal keywords in C2x mode just like any other keyword (C2x leaves open the possibility of implementation using predefined macros instead - thus, there aren't any testcases asserting that they aren't macros). As in C++ and previous versions of C, true and false are handled like signed 1 and 0 in #if (there was an intermediate state in some C2x drafts where they had different macro expansions that were unsigned in #if). Bootstrapped with no regressions for x86_64-pc-linux-gnu. As with the removal of unprototyped functions, this change has a high risk of breaking some old code and people doing GNU/Linux distribution builds may wish to see how much is broken in a build with a -std=gnu2x default. gcc/ * ginclude/stdalign.h [defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L]: Disable all content. 
* ginclude/stdbool.h [defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L] (bool, true, false): Do not define. gcc/c-family/ * c-common.cc (c_common_reswords): Use D_C2X instead of D_CXXONLY for alignas, alignof, bool, false, static_assert, thread_local and true. gcc/c/ * c-parser.cc (c_parser_static_assert_declaration_no_semi) (c_parser_alignas_specifier, c_parser_alignof_expression): Allow for C2x spellings of keywords. (c_parser_postfix_expression): Handle RID_TRUE and RID_FALSE. gcc/testsuite/ * gcc.dg/c11-keywords-1.c, gcc.dg/c2x-align-1.c, gcc.dg/c2x-align-6.c, gcc.dg/c2x-bool-2.c, gcc.dg/c2x-static-assert-3.c, gcc.dg/c2x-static-assert-4.c, gcc.dg/c2x-thread-local-1.c: New tests. * gcc.dg/c2x-bool-1.c: Update expectations. libcpp/ * include/cpplib.h (struct cpp_options): Add true_false. * expr.cc (eval_token): Check true_false not cplusplus to determine whether to handle true and false keywords. * init.cc (struct lang_flags): Add true_false. (lang_defaults): Update. (cpp_set_lang): Set true_false. --- libcpp/include/cpplib.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'libcpp/include/cpplib.h') diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index c25bcf2..2db1e9c 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -526,6 +526,9 @@ struct cpp_options /* Nonzero for C++23 delimited escape sequences. */ unsigned char delimited_escape_seqs; + /* Nonzero for 'true' and 'false' in #if expressions. */ + unsigned char true_false; + /* Holds the name of the target (execution) character set. */ const char *narrow_charset; -- cgit v1.1