From 8f51cf38bb9628546effe66c070188d10f80b5ca Mon Sep 17 00:00:00 2001
From: Joseph Myers <joseph@codesourcery.com>
Date: Thu, 6 May 2021 23:20:35 +0000
Subject: preprocessor: Fix pp-number lexing of digit separators [PR83873,
 PR97604]

When the preprocessor lexes preprocessing numbers in lex_number, it
accepts digit separators in more cases than actually permitted in
pp-numbers by the standard syntax.

One thing this accepts is adjacent digit separators; there is some
code to reject those later, but as noted in bug 83873 it fails to
cover the case of adjacent digit separators within a floating-point
exponent.  Accepting adjacent digit separators only results in a
missing diagnostic, not in valid code being rejected or being accepted
with incorrect semantics, because the correct lexing in such a case
would have '' start the following preprocessing tokens, and no valid
preprocessing token starts '' while ' isn't valid on its own as a
preprocessing token either.  So this patch fixes that case by moving
the error for adjacent digit separators to lex_number (allowing a more
specific diagnostic than if '' were excluded from the pp-number
completely).

Other cases inappropriately accepted involve digit separators before
'.', 'e+', 'e-', 'p+' or 'p-' (or corresponding uppercase variants).
In those cases, as shown by the test digit-sep-pp-number.C added, this
can result in valid code being wrongly rejected as a result of too
many characters being included in the pp-number.  So this case is
fixed by terminating the pp-number at the correct character according
to the standard.  That test also covers the case where a digit
separator was followed by an identifier-nondigit that is not a
nondigit (e.g. a UCN); that case was already handled correctly.

Bootstrapped with no regressions for x86_64-pc-linux-gnu.

libcpp/
	PR c++/83873
	PR preprocessor/97604
	* lex.c (lex_number): Reject adjacent digit separators here.  Do
	not allow digit separators before '.' or an exponent with sign.
	* expr.c (cpp_classify_number): Do not check for adjacent digit
	separators here.

gcc/testsuite/
	PR c++/83873
	PR preprocessor/97604
	* g++.dg/cpp1y/digit-sep-neg-2.C,
	g++.dg/cpp1y/digit-sep-pp-number.C: New tests.
	* g++.dg/cpp1y/digit-sep-line-neg.C, g++.dg/cpp1y/digit-sep-neg.C:
	Adjust expected messages.
---
 libcpp/lex.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 06bcc31..9662f1b 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1548,18 +1548,28 @@ lex_number (cpp_reader *pfile, cpp_string *number,
   base = pfile->buffer->cur - 1;
   do
     {
+      const uchar *adj_digit_sep = NULL;
       cur = pfile->buffer->cur;
 
       /* N.B. ISIDNUM does not include $.  */
-      while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
-	     || VALID_SIGN (*cur, cur[-1]))
+      while (ISIDNUM (*cur)
+	     || (*cur == '.' && !DIGIT_SEP (cur[-1]))
+	     || DIGIT_SEP (*cur)
+	     || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
 	{
 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
+	  /* Adjacent digit separators do not form part of the pp-number syntax.
+	     However, they can safely be diagnosed here as an error, since '' is
+	     not a valid preprocessing token.  */
+	  if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
+	    adj_digit_sep = cur;
 	  cur++;
 	}
       /* A number can't end with a digit separator.  */
       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
 	--cur;
+      if (adj_digit_sep && adj_digit_sep < cur)
+	cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
 
       pfile->buffer->cur = cur;
     }
-- 
cgit v1.1


From 170c850e4bd46745e2a5130b5eb09f9fceb98416 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Fri, 7 May 2021 17:48:37 +0200
Subject: libcpp: Fix up pragma preprocessing [PR100450]

Since the r0-85991-ga25a8f3be322fe0f838947b679f73d6efc2a412c
https://gcc.gnu.org/legacy-ml/gcc-patches/2008-02/msg01329.html
changes, so that we handle macros inside of pragmas that should expand
macros, during preprocessing we print those pragmas token by token,
with CPP_PRAGMA printed as
      fputs ("#pragma ", print.outf);
      if (space)
        fprintf (print.outf, "%s %s", space, name);
      else
        fprintf (print.outf, "%s", name);
where name is some identifier (so e.g. print
 #pragma omp parallel
or
 #pragma omp for
etc.).  Because it ends in an identifier, we need to handle it like
an identifier (i.e. CPP_NAME) for the decision whether a space needs
to be emitted in between that #pragma whatever or #pragma whatever whatever
and the following token; otherwise the attached testcase is preprocessed as
 #pragma omp forreduction(+:red)
rather than
 #pragma omp for reduction(+:red)
The cpp_avoid_paste function is only called for this purpose.

2021-05-07  Jakub Jelinek  <jakub@redhat.com>

	PR c/100450
	* lex.c (cpp_avoid_paste): Handle token1 CPP_PRAGMA like CPP_NAME.

	* c-c++-common/gomp/pr100450.c: New test.
---
 libcpp/lex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 9662f1b..b7ce85a 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -3719,6 +3719,7 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
     case CPP_DEREF:	return c == '*';
     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
+    case CPP_PRAGMA:
     case CPP_NAME:	return ((b == CPP_NUMBER
 				 && name_p (pfile, &token2->val.str))
 				|| b == CPP_NAME
-- 
cgit v1.1


From 3e3fdf3d5217e5a2d075ca399b557b2e886dcd18 Mon Sep 17 00:00:00 2001
From: Joseph Myers <joseph@codesourcery.com>
Date: Tue, 11 May 2021 18:54:32 +0000
Subject: preprocessor: Fix cpp_avoid_paste for digit separators

The libcpp function cpp_avoid_paste is used to insert whitespace in
preprocessed output where needed to avoid two consecutive
preprocessing tokens, that logically (e.g. when stringized) do not
have whitespace between them, from being incorrectly lexed as one when
the preprocessed input is reread by a compiler.

This fails to allow for digit separators, meaning that invalid
code, that has a CPP_NUMBER (from a macro expansion) followed by a
character literal, can result in preprocessed output with a valid use
of digit separators, so that required syntax errors do not occur when
compiling with -save-temps.  Fix this by handling that case in
cpp_avoid_paste (as with other cases in cpp_avoid_paste, this doesn't
try to check whether the language version in use supports digit
separators; it's always OK to have unnecessary whitespace in
preprocessed output).

Note: there are other cases, with various kinds of wide character or
string literal following a CPP_NUMBER, where spurious pasting of
preprocessing tokens can occur but the sequence of tokens remains
invalid both before and after that pasting.  Maybe cpp_avoid_paste
should also handle those cases (and similar cases after a CPP_NAME),
to ensure the sequence of preprocessing tokens in preprocessed output
is exactly right, whether or not it affects whether syntax errors
occur.  This patch only addresses the case with digit separators where
invalid code can fail to be diagnosed without the space inserted.

Bootstrapped with no regressions for x86_64-pc-linux-gnu.

libcpp/
	* lex.c (cpp_avoid_paste): Do not allow pasting CPP_NUMBER with
	CPP_CHAR.

gcc/testsuite/
	* g++.dg/cpp1y/digit-sep-paste.C, gcc.dg/c2x-digit-separators-3.c:
	New tests.
---
 libcpp/lex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index b7ce85a..36cd2e3 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -3725,6 +3725,7 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
 				|| b == CPP_NAME
 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
+				|| b == CPP_CHAR
 				|| c == '.' || c == '+' || c == '-');
 				      /* UCNs */
     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
-- 
cgit v1.1


From c6b664e2c4c127025e076d8b584abe0976694629 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 12 May 2021 15:14:35 +0200
Subject: libcpp: Fix up -fdirectives-only preprocessing of includes not ending
 with newline [PR100392]

If a header doesn't end with a new-line, with -fdirectives-only we right now
preprocess it as
int i = 1;# 2 "pr100392.c" 2
i.e. the line directive isn't on the next line, which means we fail to parse
it when compiling.

GCC 10 and earlier libcpp/directives-only.c had for this:
  if (!pfile->state.skipping && cur != base)
    {
      /* If the file was not newline terminated, add rlimit, which is
         guaranteed to point to a newline, to the end of our range.  */
      if (cur[-1] != '\n')
        {
          cur++;
          CPP_INCREMENT_LINE (pfile, 0);
          lines++;
        }

      cb->print_lines (lines, base, cur - base);
    }
and we have the assertion
      /* Files always end in a newline or carriage return.  We rely on this for
         character peeking safety.  */
      gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
So, this patch just re-adds more or less the same thing, so that we emit
a newline after the last line even when it wasn't there before.

2021-05-12  Jakub Jelinek  <jakub@redhat.com>

	PR preprocessor/100392
	* lex.c (cpp_directive_only_process): If buffer doesn't end with '\n',
	add buffer->rlimit[0] character to the printed range and
	CPP_INCREMENT_LINE and increment line_count.

	* gcc.dg/cpp/pr100392.c: New test.
	* gcc.dg/cpp/pr100392.h: New file.
---
 libcpp/lex.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 36cd2e3..6fd722a 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -4768,7 +4768,18 @@ cpp_directive_only_process (cpp_reader *pfile,
 	}
 
       if (buffer->rlimit > base && !pfile->state.skipping)
-	cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base);
+	{
+	  const unsigned char *limit = buffer->rlimit;
+	  /* If the file was not newline terminated, add rlimit, which is
+	     guaranteed to point to a newline, to the end of our range.  */
+	  if (limit[-1] != '\n')
+	    {
+	      limit++;
+	      CPP_INCREMENT_LINE (pfile, 0);
+	      line_count++;
+	    }
+	  cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
+	}
 
       _cpp_pop_buffer (pfile);
     }
-- 
cgit v1.1


From d15a2d261b24adcbfe5e663b15dde3df5d2b3486 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Thu, 20 May 2021 09:09:07 +0200
Subject: libcpp: Fix up -fdirectives-only handling of // comments on last line
 not terminated with newline [PR100646]

As can be seen in the testcases, before the -fdirectives-only preprocessing
rewrite the preprocessor would assume // comments are terminated by the
end of file even when the newline wasn't there, but now we error out.
The following patch restores the previous behavior.

2021-05-20  Jakub Jelinek  <jakub@redhat.com>

	PR preprocessor/100646
	* lex.c (cpp_directive_only_process): Treat end of file as termination
	for !is_block comments.

	* gcc.dg/cpp/pr100646-1.c: New test.
	* gcc.dg/cpp/pr100646-2.c: New test.
---
 libcpp/lex.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 6fd722a..3618fa5 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -4480,8 +4480,9 @@ cpp_directive_only_process (cpp_reader *pfile,
 			break;
 		      }
 		  }
-		cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
-				     "unterminated comment");
+		if (pos < limit || is_block)
+		  cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
+				       "unterminated comment");
 	      done_comment:
 		lwm = pos;
 		break;
-- 
cgit v1.1


From c4d6dcacfca1b804504515496e6d9de176d7f51e Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 1 Sep 2021 22:33:06 +0200
Subject: libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode
 Standard Annex 31

The following patch implements the
P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31
paper.  We already allow UTF-8 characters in the source, so that part
is already implemented, so IMHO all we need to do is pedwarn instead of
just warn for the (default) -Wnormalize=nfc (or for -Wnormalize={id,nfkc})
if the character is not in NFC and to use the Unicode XID_Start and
XID_Continue derived core properties to find out what characters are allowed
(the standard actually adds U+005F to XID_Start, but we are handling the
ASCII compatible characters differently already and they aren't allowed
in UCNs in identifiers).  Instead of hardcoding the large tables
in ucnid.tab, this patch makes makeucnid.c read them from the Unicode
tables (13.0.0 version at this point).

For non-pedantic mode, we accept as 2nd+ char in identifiers a union
of valid characters in all supported modes, but for the 1st char it
was actually pedantically requiring that it is not any of the characters
that may not appear in the currently chosen standard as the first character.
This patch changes it such that also what is allowed at the start of an
identifier is a union of characters valid at the start of an identifier
in any of the pedantic modes.

2021-09-01  Jakub Jelinek  <jakub@redhat.com>

	PR c++/100977
libcpp/
	* include/cpplib.h (struct cpp_options): Add cxx23_identifiers.
	* charset.c (CXX23, NXX23): New enumerators.
	(CID, NFC, NKC, CTX): Renumber.
	(ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and
	NXX23 flags for cxx23_identifiers.  For start character in
	non-pedantic mode, allow characters that are allowed as start
	characters in any of the supported language modes, rather than
	disallowing characters allowed only as non-start characters in
	current mode but for characters from other language modes allowing
	them even if they are never allowed at start.
	* init.c (struct lang_flags): Add cxx23_identifiers.
	(lang_defaults): Add cxx23_identifiers column.
	(cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers).
	* lex.c (warn_about_normalization): If cxx23_identifiers, use
	cpp_pedwarning_with_line instead of cpp_warning_with_line for
	"is not in NFC" diagnostics.
	* makeucnid.c: Adjust usage comment.
	(CXX23, NXX23): New enumerators.
	(all_languages): Add CXX23.
	(not_NFC, not_NFKC, maybe_not_NFC): Renumber.
	(read_derivedcore): New function.
	(write_table): Print also CXX23 and NXX23 columns.
	(main): Require 5 arguments instead of 4, call read_derivedcore.
	* ucnid.h: Regenerated using Unicode 13.0.0 files.
gcc/testsuite/
	* g++.dg/cpp23/normalize1.C: New test.
	* g++.dg/cpp23/normalize2.C: New test.
	* g++.dg/cpp23/normalize3.C: New test.
	* g++.dg/cpp23/normalize4.C: New test.
	* g++.dg/cpp23/normalize5.C: New test.
	* g++.dg/cpp23/normalize6.C: New test.
	* g++.dg/cpp23/normalize7.C: New test.
	* g++.dg/cpp23/ucnid-1-utf8.C: New test.
	* g++.dg/cpp23/ucnid-2-utf8.C: New test.
	* gcc.dg/cpp/ucnid-4.c: Don't expect
	"not valid at the start of an identifier" errors.
	* gcc.dg/cpp/ucnid-4-utf8.c: Likewise.
	* gcc.dg/cpp/ucnid-5-utf8.c: New test.
---
 libcpp/lex.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'libcpp/lex.c')

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 3618fa5..8e3ef09 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1306,6 +1306,9 @@ warn_about_normalization (cpp_reader *pfile,
       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
 			       "`%.*s' is not in NFKC", (int) sz, buf);
+      else if (CPP_OPTION (pfile, cxx23_identifiers))
+	cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
+				  "`%.*s' is not in NFC", (int) sz, buf);
       else
 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
 			       "`%.*s' is not in NFC", (int) sz, buf);
-- 
cgit v1.1