From 572f5e1bc68e131b25cd2d5ba231e932f5038904 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 7 Sep 2022 08:44:38 +0200
Subject: libcpp: Named universal character escapes and delimited escape
 sequence tweaks

On Tue, Aug 30, 2022 at 09:10:37PM +0000, Joseph Myers wrote:
> I'm seeing build failures of glibc for powerpc64, as illustrated by the
> following C code:
>
> #if 0
> \NARG
> #endif
>
> (the actual sysdeps/powerpc/powerpc64/sysdep.h code is inside #ifdef
> __ASSEMBLER__).
>
> This shows some problems with this feature - and with delimited escape
> sequences - as it affects C.  It's fine to accept it as an extension
> inside string and character literals, because \N or \u{...} would be
> invalid in the absence of the feature (i.e. the syntax for such literals
> fails to match, meaning that the rule about undefined behavior for a
> single ' or " as a pp-token applies).  But outside string and character
> literals, the usual lexing rules apply, the \ is a pp-token on its own and
> the code is valid at the preprocessing level, and with expansion of macros
> appearing before or after the \ (e.g. u defined as a macro in the \u{...}
> case) it may be valid code at the language level as well.  I don't know
> what older C++ versions say about this, but for C this means e.g.
>
> #define z(x) 0
> #define a z(
> int x = a\NARG);
>
> needs to be accepted as expanding to "int x = 0;", not interpreted as
> using the \N feature in an identifier and produce an error.

The following patch changes this, so that:
1) outside of string/character literals, \N without following { is never
   treated as an error nor warning, it is silently treated as \ separate
   token followed by whatever is after it
2) \u{123} and \N{LATIN SMALL LETTER A WITH ACUTE} are not handled as
   extension at all outside of string/character literals in the strict
   standard modes (-std=c*) except for -std=c++{23,2b}, only in the
   -std=gnu* modes, because it changes behavior on valid sources, e.g.
   #define z(x) 0
   #define a z(
   int x = a\u{123});
   int y = a\N{LATIN SMALL LETTER A WITH ACUTE});
3) introduces -Wunicode warning (on by default) and warns for cases
   of what looks like invalid delimited escape sequence or named
   universal character escape outside of string/character literals
   and is treated as separate tokens

2022-09-07  Jakub Jelinek  <jakub@redhat.com>

libcpp/
	* include/cpplib.h (struct cpp_options): Add cpp_warn_unicode member.
	(enum cpp_warning_reason): Add CPP_W_UNICODE.
	* init.cc (cpp_create_reader): Initialize cpp_warn_unicode.
	* charset.cc (_cpp_valid_ucn): In possible identifier contexts, don't
	handle \u{ or \N{ specially in -std=c* modes except -std=c++2{3,b}.
	In possible identifier contexts, don't emit an error and punt
	if \N isn't followed by {, or if \N{} surrounds some lower case
	letters or _.  In possible identifier contexts when not C++23, don't
	emit an error but warning about unknown character names and treat as
	separate tokens.  When treating as separate tokens \u{ or \N{, emit
	warnings.
gcc/
	* doc/invoke.texi (-Wno-unicode): Document.
gcc/c-family/
	* c.opt (Winvalid-utf8): Use ObjC instead of objC.  Remove
	" in comments" from description.
	(Wunicode): New option.
gcc/testsuite/
	* c-c++-common/cpp/delimited-escape-seq-4.c: New test.
	* c-c++-common/cpp/delimited-escape-seq-5.c: New test.
	* c-c++-common/cpp/delimited-escape-seq-6.c: New test.
	* c-c++-common/cpp/delimited-escape-seq-7.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-5.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-6.c: New test.
	* c-c++-common/cpp/named-universal-char-escape-7.c: New test.
	* g++.dg/cpp23/named-universal-char-escape1.C: New test.
	* g++.dg/cpp23/named-universal-char-escape2.C: New test.
---
 libcpp/include/cpplib.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'libcpp/include/cpplib.h')

diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 1a3fb19..c25bcf2 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -565,6 +565,10 @@ struct cpp_options
      2 if it should be a pedwarn.  */
   unsigned char cpp_warn_invalid_utf8;
 
+  /* True if libcpp should warn about invalid forms of delimited or named
+     escape sequences.  */
+  bool cpp_warn_unicode;
+
   /* True if -finput-charset= option has been used explicitly.  */
   bool cpp_input_charset_explicit;
 
@@ -675,7 +679,8 @@ enum cpp_warning_reason {
   CPP_W_CXX20_COMPAT,
   CPP_W_EXPANSION_TO_DEFINED,
   CPP_W_BIDIRECTIONAL,
-  CPP_W_INVALID_UTF8
+  CPP_W_INVALID_UTF8,
+  CPP_W_UNICODE
 };
 
 /* Callback for header lookup for HEADER, which is the name of a
-- 
cgit v1.1


From 0a91bdaf177409a2a5e7895bce4f0e7091b4b3ca Mon Sep 17 00:00:00 2001
From: Joseph Myers <joseph@codesourcery.com>
Date: Wed, 7 Sep 2022 13:56:25 +0000
Subject: c: New C2x keywords

C2x follows C++ in making alignas, alignof, bool, false,
static_assert, thread_local and true keywords; implement this
accordingly.  This implementation makes them normal keywords in C2x
mode just like any other keyword (C2x leaves open the possibility of
implementation using predefined macros instead - thus, there aren't
any testcases asserting that they aren't macros).  As in C++ and
previous versions of C, true and false are handled like signed 1 and 0
in #if (there was an intermediate state in some C2x drafts where they
had different macro expansions that were unsigned in #if).

Bootstrapped with no regressions for x86_64-pc-linux-gnu.

As with the removal of unprototyped functions, this change has a high
risk of breaking some old code and people doing GNU/Linux distribution
builds may wish to see how much is broken in a build with a -std=gnu2x
default.

gcc/
	* ginclude/stdalign.h [defined __STDC_VERSION__ &&
	__STDC_VERSION__ > 201710L]: Disable all content.
	* ginclude/stdbool.h [defined __STDC_VERSION__ && __STDC_VERSION__
	> 201710L] (bool, true, false): Do not define.

gcc/c-family/
	* c-common.cc (c_common_reswords): Use D_C2X instead of D_CXXONLY
	for alignas, alignof, bool, false, static_assert, thread_local and
	true.

gcc/c/
	* c-parser.cc (c_parser_static_assert_declaration_no_semi)
	(c_parser_alignas_specifier, c_parser_alignof_expression): Allow
	for C2x spellings of keywords.
	(c_parser_postfix_expression): Handle RID_TRUE and RID_FALSE.

gcc/testsuite/
	* gcc.dg/c11-keywords-1.c, gcc.dg/c2x-align-1.c,
	gcc.dg/c2x-align-6.c, gcc.dg/c2x-bool-2.c,
	gcc.dg/c2x-static-assert-3.c, gcc.dg/c2x-static-assert-4.c,
	gcc.dg/c2x-thread-local-1.c: New tests.
	* gcc.dg/c2x-bool-1.c: Update expectations.

libcpp/
	* include/cpplib.h (struct cpp_options): Add true_false.
	* expr.cc (eval_token): Check true_false not cplusplus to
	determine whether to handle true and false keywords.
	* init.cc (struct lang_flags): Add true_false.
	(lang_defaults): Update.
	(cpp_set_lang): Set true_false.
---
 libcpp/include/cpplib.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'libcpp/include/cpplib.h')

diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index c25bcf2..2db1e9c 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -526,6 +526,9 @@ struct cpp_options
   /* Nonzero for C++23 delimited escape sequences.  */
   unsigned char delimited_escape_seqs;
 
+  /* Nonzero for 'true' and 'false' in #if expressions.  */
+  unsigned char true_false;
+
   /* Holds the name of the target (execution) character set.  */
   const char *narrow_charset;
 
-- 
cgit v1.1