From 8f51cf38bb9628546effe66c070188d10f80b5ca Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 6 May 2021 23:20:35 +0000 Subject: preprocessor: Fix pp-number lexing of digit separators [PR83873, PR97604] When the preprocessor lexes preprocessing numbers in lex_number, it accepts digit separators in more cases than actually permitted in pp-numbers by the standard syntax. One thing this accepts is adjacent digit separators; there is some code to reject those later, but as noted in bug 83873 it fails to cover the case of adjacent digit separators within a floating-point exponent. Accepting adjacent digit separators only results in a missing diagnostic, not in valid code being rejected or being accepted with incorrect semantics, because the correct lexing in such a case would have '' start the following preprocessing tokens, and no valid preprocessing token starts '' while ' isn't valid on its own as a preprocessing token either. So this patch fixes that case by moving the error for adjacent digit separators to lex_number (allowing a more specific diagnostic than if '' were excluded from the pp-number completely). Other cases inappropriately accepted involve digit separators before '.', 'e+', 'e-', 'p+' or 'p-' (or corresponding uppercase variants). In those cases, as shown by the test digit-sep-pp-number.C added, this can result in valid code being wrongly rejected as a result of too many characters being included in the pp-number. So this case is fixed by terminating the pp-number at the correct character according to the standard. That test also covers the case where a digit separator was followed by an identifier-nondigit that is not a nondigit (e.g. a UCN); that case was already handled correctly. Bootstrapped with no regressions for x86_64-pc-linux-gnu. libcpp/ PR c++/83873 PR preprocessor/97604 * lex.c (lex_number): Reject adjacent digit separators here. Do not allow digit separators before '.' or an exponent with sign. * expr.c (cpp_classify_number): Do not check for adjacent digit separators here. gcc/testsuite/ PR c++/83873 PR preprocessor/97604 * g++.dg/cpp1y/digit-sep-neg-2.C, g++.dg/cpp1y/digit-sep-pp-number.C: New tests. * g++.dg/cpp1y/digit-sep-line-neg.C, g++.dg/cpp1y/digit-sep-neg.C: Adjust expected messages. --- libcpp/lex.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index 06bcc31..9662f1b 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1548,18 +1548,28 @@ lex_number (cpp_reader *pfile, cpp_string *number, base = pfile->buffer->cur - 1; do { + const uchar *adj_digit_sep = NULL; cur = pfile->buffer->cur; /* N.B. ISIDNUM does not include $. */ - while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur) - || VALID_SIGN (*cur, cur[-1])) + while (ISIDNUM (*cur) + || (*cur == '.' && !DIGIT_SEP (cur[-1])) + || DIGIT_SEP (*cur) + || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2]))) { NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur); + /* Adjacent digit separators do not form part of the pp-number syntax. + However, they can safely be diagnosed here as an error, since '' is + not a valid preprocessing token. */ + if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep) + adj_digit_sep = cur; cur++; } /* A number can't end with a digit separator. */ while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1])) --cur; + if (adj_digit_sep && adj_digit_sep < cur) + cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators"); pfile->buffer->cur = cur; } -- cgit v1.1 From 170c850e4bd46745e2a5130b5eb09f9fceb98416 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 7 May 2021 17:48:37 +0200 Subject: libcpp: Fix up pragma preprocessing [PR100450] Since the r0-85991-ga25a8f3be322fe0f838947b679f73d6efc2a412c https://gcc.gnu.org/legacy-ml/gcc-patches/2008-02/msg01329.html changes, so that we handle macros inside of pragmas that should expand macros, during preprocessing we print those pragmas token by token, with CPP_PRAGMA printed as fputs ("#pragma ", print.outf); if (space) fprintf (print.outf, "%s %s", space, name); else fprintf (print.outf, "%s", name); where name is some identifier (so e.g. print #pragma omp parallel or #pragma omp for etc.). Because it ends in an identifier, we need to handle it like an identifier (i.e. CPP_NAME) for the decision whether a space needs to be emitted in between that #pragma whatever or #pragma whatever whatever and following token, otherwise the attached testcase is preprocessed as #pragma omp forreduction(+:red) rather than #pragma omp for reduction(+:red) The cpp_avoid_paste function is only called for this purpose. 2021-05-07 Jakub Jelinek PR c/100450 * lex.c (cpp_avoid_paste): Handle token1 CPP_PRAGMA like CPP_NAME. * c-c++-common/gomp/pr100450.c: New test. --- libcpp/lex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index 9662f1b..b7ce85a 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -3719,6 +3719,7 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, case CPP_DEREF: return c == '*'; case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER; case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */ + case CPP_PRAGMA: case CPP_NAME: return ((b == CPP_NUMBER && name_p (pfile, &token2->val.str)) || b == CPP_NAME -- cgit v1.1 From 3e3fdf3d5217e5a2d075ca399b557b2e886dcd18 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Tue, 11 May 2021 18:54:32 +0000 Subject: preprocessor: Fix cpp_avoid_paste for digit separators The libcpp function cpp_avoid_paste is used to insert whitespace in preprocessed output where needed to avoid two consecutive preprocessing tokens, that logically (e.g. when stringized) do not have whitespace between them, from being incorrectly lexed as one when the preprocessed input is reread by a compiler. This fails to allow for digit separators, so meaning that invalid code, that has a CPP_NUMBER (from a macro expansion) followed by a character literal, can result in preprocessed output with a valid use of digit separators, so that required syntax errors do not occur when compiling with -save-temps. Fix this by handling that case in cpp_avoid_paste (as with other cases in cpp_avoid_paste, this doesn't try to check whether the language version in use supports digit separators; it's always OK to have unnecessary whitespace in preprocessed output). Note: there are other cases, with various kinds of wide character or string literal following a CPP_NUMBER, where spurious pasting of preprocessing tokens can occur but the sequence of tokens remains invalid both before and after that pasting. Maybe cpp_avoid_paste should also handle those cases (and similar cases after a CPP_NAME), to ensure the sequence of preprocessing tokens in preprocessed output is exactly right, whether or not it affects whether syntax errors occur. This patch only addresses the case with digit separators where invalid code can fail to be diagnosed without the space inserted. Bootstrapped with no regressions for x86_64-pc-linux-gnu. libcpp/ * lex.c (cpp_avoid_paste): Do not allow pasting CPP_NUMBER with CPP_CHAR. gcc/testsuite/ * g++.dg/cpp1y/digit-sep-paste.C, gcc.dg/c2x-digit-separators-3.c: New tests. --- libcpp/lex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index b7ce85a..36cd2e3 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -3725,6 +3725,7 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, || b == CPP_NAME || b == CPP_CHAR || b == CPP_STRING); /* L */ case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME + || b == CPP_CHAR || c == '.' || c == '+' || c == '-'); /* UCNs */ case CPP_OTHER: return ((token1->val.str.text[0] == '\\' -- cgit v1.1 From c6b664e2c4c127025e076d8b584abe0976694629 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 12 May 2021 15:14:35 +0200 Subject: libcpp: Fix up -fdirectives-only preprocessing of includes not ending with newline [PR100392] If a header doesn't end with a new-line, with -fdirectives-only we right now preprocess it as int i = 1;# 2 "pr100392.c" 2 i.e. the line directive isn't on the next line, which means we fail to parse it when compiling. GCC 10 and earlier libcpp/directives-only.c had for this: if (!pfile->state.skipping && cur != base) { /* If the file was not newline terminated, add rlimit, which is guaranteed to point to a newline, to the end of our range. */ if (cur[-1] != '\n') { cur++; CPP_INCREMENT_LINE (pfile, 0); lines++; } cb->print_lines (lines, base, cur - base); } and we have the assertion /* Files always end in a newline or carriage return. We rely on this for character peeking safety. */ gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r'); So, this patch just does readd the more less same thing, so that we emit a newline after the inline even when it wasn't there before. 2021-05-12 Jakub Jelinek PR preprocessor/100392 * lex.c (cpp_directive_only_process): If buffer doesn't end with '\n', add buffer->rlimit[0] character to the printed range and CPP_INCREMENT_LINE and increment line_count. * gcc.dg/cpp/pr100392.c: New test. * gcc.dg/cpp/pr100392.h: New file. --- libcpp/lex.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index 36cd2e3..6fd722a 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -4768,7 +4768,18 @@ cpp_directive_only_process (cpp_reader *pfile, } if (buffer->rlimit > base && !pfile->state.skipping) - cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base); + { + const unsigned char *limit = buffer->rlimit; + /* If the file was not newline terminated, add rlimit, which is + guaranteed to point to a newline, to the end of our range. */ + if (limit[-1] != '\n') + { + limit++; + CPP_INCREMENT_LINE (pfile, 0); + line_count++; + } + cb (pfile, CPP_DO_print, data, line_count, base, limit - base); + } _cpp_pop_buffer (pfile); } -- cgit v1.1 From d15a2d261b24adcbfe5e663b15dde3df5d2b3486 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 20 May 2021 09:09:07 +0200 Subject: libcpp: Fix up -fdirectives-only handling of // comments on last line not terminated with newline [PR100646] As can be seen on the testcases, before the -fdirectives-only preprocessing rewrite the preprocessor would assume // comments are terminated by the end of file even when newline wasn't there, but now we error out. The following patch restores the previous behavior. 2021-05-20 Jakub Jelinek PR preprocessor/100646 * lex.c (cpp_directive_only_process): Treat end of file as termination for !is_block comments. * gcc.dg/cpp/pr100646-1.c: New test. * gcc.dg/cpp/pr100646-2.c: New test. --- libcpp/lex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index 6fd722a..3618fa5 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -4480,8 +4480,9 @@ cpp_directive_only_process (cpp_reader *pfile, break; } } - cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0, - "unterminated comment"); + if (pos < limit || is_block) + cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0, + "unterminated comment"); done_comment: lwm = pos; break; -- cgit v1.1 From c4d6dcacfca1b804504515496e6d9de176d7f51e Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 1 Sep 2021 22:33:06 +0200 Subject: libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 The following patch implements the P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31 paper. We already allow UTF-8 characters in the source, so that part is already implemented, so IMHO all we need to do is pedwarn instead of just warn for the (default) -Wnormalize=nfc (or for -Wnormalize={id,nkfc}) if the character is not in NFC and to use the unicode XID_Start and XID_Continue derived code properties to find out what characters are allowed (the standard actually adds U+005F to XID_Start, but we are handling the ASCII compatible characters differently already and they aren't allowed in UCNs in identifiers). Instead of hardcoding the large tables in ucnid.tab, this patch makes makeucnid.c read them from the Unicode tables (13.0.0 version at this point). For non-pedantic mode, we accept as 2nd+ char in identifiers a union of valid characters in all supported modes, but for the 1st char it was actually pedantically requiring that it is not any of the characters that may not appear in the currently chosen standard as the first character. This patch changes it such that also what is allowed at the start of an identifier is a union of characters valid at the start of an identifier in any of the pedantic modes. 2021-09-01 Jakub Jelinek PR c++/100977 libcpp/ * include/cpplib.h (struct cpp_options): Add cxx23_identifiers. * charset.c (CXX23, NXX23): New enumerators. (CID, NFC, NKC, CTX): Renumber. (ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and NXX23 flags for cxx23_identifiers. For start character in non-pedantic mode, allow characters that are allowed as start characters in any of the supported language modes, rather than disallowing characters allowed only as non-start characters in current mode but for characters from other language modes allowing them even if they are never allowed at start. * init.c (struct lang_flags): Add cxx23_identifiers. (lang_defaults): Add cxx23_identifiers column. (cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers). * lex.c (warn_about_normalization): If cxx23_identifiers, use cpp_pedwarning_with_line instead of cpp_warning_with_line for "is not in NFC" diagnostics. * makeucnid.c: Adjust usage comment. (CXX23, NXX23): New enumerators. (all_languages): Add CXX23. (not_NFC, not_NFKC, maybe_not_NFC): Renumber. (read_derivedcore): New function. (write_table): Print also CXX23 and NXX23 columns. (main): Require 5 arguments instead of 4, call read_derivedcore. * ucnid.h: Regenerated using Unicode 13.0.0 files. gcc/testsuite/ * g++.dg/cpp23/normalize1.C: New test. * g++.dg/cpp23/normalize2.C: New test. * g++.dg/cpp23/normalize3.C: New test. * g++.dg/cpp23/normalize4.C: New test. * g++.dg/cpp23/normalize5.C: New test. * g++.dg/cpp23/normalize6.C: New test. * g++.dg/cpp23/normalize7.C: New test. * g++.dg/cpp23/ucnid-1-utf8.C: New test. * g++.dg/cpp23/ucnid-2-utf8.C: New test. * gcc.dg/cpp/ucnid-4.c: Don't expect "not valid at the start of an identifier" errors. * gcc.dg/cpp/ucnid-4-utf8.c: Likewise. * gcc.dg/cpp/ucnid-5-utf8.c: New test. --- libcpp/lex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'libcpp/lex.c') diff --git a/libcpp/lex.c b/libcpp/lex.c index 3618fa5..8e3ef09 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1306,6 +1306,9 @@ warn_about_normalization (cpp_reader *pfile, if (NORMALIZE_STATE_RESULT (s) == normalized_C) cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, "`%.*s' is not in NFKC", (int) sz, buf); + else if (CPP_OPTION (pfile, cxx23_identifiers)) + cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, + "`%.*s' is not in NFC", (int) sz, buf); else cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, "`%.*s' is not in NFC", (int) sz, buf); -- cgit v1.1