aboutsummaryrefslogtreecommitdiff
path: root/clang/lib/Lex/Lexer.cpp
diff options
context:
space:
mode:
authorCorentin Jabot <corentinjabot@gmail.com>2023-06-23 13:02:37 +0200
committerCorentin Jabot <corentinjabot@gmail.com>2023-07-12 08:03:23 +0200
commit304e97469455b57f6496f68fef569eeaed3a2cd8 (patch)
tree0e3e341478bbdd5fdf419b1ae082a3b29f15c2ca /clang/lib/Lex/Lexer.cpp
parent20ae2d200dc94b051757174ec0f3a03103b8e1e2 (diff)
downloadllvm-304e97469455b57f6496f68fef569eeaed3a2cd8.zip
llvm-304e97469455b57f6496f68fef569eeaed3a2cd8.tar.gz
llvm-304e97469455b57f6496f68fef569eeaed3a2cd8.tar.bz2
[Clang] Correctly handle $, @, and ` when represented as UCN
This covers * P2558R2 (C++, wg21.link/P2558) * N2701 (C, https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2701.htm) * N3124 (C, https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3124.pdf) This patch * Disallows representing $ as a UCN in all language modes, which did not work properly (see GH62133), and which is made ill-formed in C++ and C by P2558 and N3124 respectively * Allows a UCN for any character in C2X, in string and character literals Fixes #62133 Reviewed By: #clang-language-wg, tahonermann Differential Revision: https://reviews.llvm.org/D153621
Diffstat (limited to 'clang/lib/Lex/Lexer.cpp')
-rw-r--r--clang/lib/Lex/Lexer.cpp14
1 files changed, 8 insertions, 6 deletions
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 82f65ef..3070407 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -3484,9 +3484,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
if (LangOpts.AsmPreprocessor)
return CodePoint;
- // C99 6.4.3p2: A universal character name shall not specify a character whose
- // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
- // 0060 (`), nor one in the range D800 through DFFF inclusive.)
+ // C2x 6.4.3p2: A universal character name shall not designate a code point
+ // where the hexadecimal value is:
+ // - in the range D800 through DFFF inclusive; or
+ // - greater than 10FFFF.
+ // A universal-character-name outside the c-char-sequence of a character
+ // constant, or the s-char-sequence of a string-literal shall not designate
+ // a control character or a character in the basic character set.
+
// C++11 [lex.charset]p2: If the hexadecimal value for a
// universal-character-name corresponds to a surrogate code point (in the
// range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
@@ -3496,9 +3501,6 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
// ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
// basic source character set, the program is ill-formed.
if (CodePoint < 0xA0) {
- if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
- return CodePoint;
-
// We don't use isLexingRawMode() here because we need to warn about bad
// UCNs even when skipping preprocessing tokens in a #if block.
if (Result && PP) {