diff options
author | Corentin Jabot <corentin.jabot@gmail.com> | 2021-09-14 09:11:23 -0400 |
---|---|---|
committer | Aaron Ballman <aaron@aaronballman.com> | 2021-09-14 09:12:22 -0400 |
commit | 601102d282d5e9a1429fea52ee17303aec8a7c10 (patch) | |
tree | 524c34d1746fc2a62524f8ec2650c7f912196810 /clang/lib | |
parent | 9bbc0c1ffb47f9cf4c9d8e9a0e8100002fe5aafb (diff) | |
download | llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.zip llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.gz llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.bz2 |
Cleanup identifier parsing; NFC
Rename methods to clearly signal when they only deal with ASCII,
simplify the parsing of identifiers, and use start/continue instead of
head/body for consistency with Unicode terminology.
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/ARCMigrate/ObjCMT.cpp | 2 | ||||
-rw-r--r-- | clang/lib/ARCMigrate/TransUnbridgedCasts.cpp | 3 | ||||
-rw-r--r-- | clang/lib/AST/MicrosoftMangle.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Basic/Module.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Edit/EditedSource.cpp | 4 | ||||
-rw-r--r-- | clang/lib/Frontend/LayoutOverrideSource.cpp | 4 | ||||
-rw-r--r-- | clang/lib/Frontend/Rewrite/FrontendActions.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp | 34 | ||||
-rw-r--r-- | clang/lib/Lex/Lexer.cpp | 254 | ||||
-rw-r--r-- | clang/lib/Lex/ModuleMap.cpp | 4 | ||||
-rw-r--r-- | clang/lib/Sema/SemaAvailability.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Sema/SemaDeclAttr.cpp | 10 | ||||
-rw-r--r-- | clang/lib/Sema/SemaExprObjC.cpp | 4 | ||||
-rw-r--r-- | clang/lib/Sema/SemaType.cpp | 4 | ||||
-rw-r--r-- | clang/lib/Tooling/Transformer/Parsing.cpp | 2 |
15 files changed, 160 insertions, 173 deletions
diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp index c8069b5..c8a389d 100644 --- a/clang/lib/ARCMigrate/ObjCMT.cpp +++ b/clang/lib/ARCMigrate/ObjCMT.cpp @@ -1144,7 +1144,7 @@ static bool AttributesMatch(const Decl *Decl1, const Decl *Decl2, static bool IsValidIdentifier(ASTContext &Ctx, const char *Name) { - if (!isIdentifierHead(Name[0])) + if (!isAsciiIdentifierStart(Name[0])) return false; std::string NameString = Name; NameString[0] = toLowercase(NameString[0]); diff --git a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp index e767ad5..b143645 100644 --- a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp +++ b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp @@ -253,7 +253,8 @@ private: SourceManager &SM = Pass.Ctx.getSourceManager(); char PrevChar = *SM.getCharacterData(InsertLoc.getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, Pass.Ctx.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, + Pass.Ctx.getLangOpts())) BridgeCall += ' '; if (Kind == OBC_BridgeTransfer) diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index d3d6508..63fc7e42 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3884,7 +3884,7 @@ void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL, // - ?[A-Z]: The range from \xc1 to \xda. // - ?[0-9]: The set of [,/\:. \n\t'-]. // - ?$XX: A fallback which maps nibbles. - if (isIdentifierBody(Byte, /*AllowDollar=*/true)) { + if (isAsciiIdentifierContinue(Byte, /*AllowDollar=*/true)) { Mangler.getStream() << Byte; } else if (isLetter(Byte & 0x7f)) { Mangler.getStream() << '?' 
<< static_cast<char>(Byte & 0x7f); diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index b6cf162..4ec0699 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -203,7 +203,7 @@ static void printModuleId(raw_ostream &OS, InputIter Begin, InputIter End, OS << "."; StringRef Name = getModuleNameFromComponent(*It); - if (!AllowStringLiterals || isValidIdentifier(Name)) + if (!AllowStringLiterals || isValidAsciiIdentifier(Name)) OS << Name; else { OS << '"'; diff --git a/clang/lib/Edit/EditedSource.cpp b/clang/lib/Edit/EditedSource.cpp index 74e6005..43da345 100644 --- a/clang/lib/Edit/EditedSource.cpp +++ b/clang/lib/Edit/EditedSource.cpp @@ -314,8 +314,8 @@ bool EditedSource::commit(const Commit &commit) { static bool canBeJoined(char left, char right, const LangOptions &LangOpts) { // FIXME: Should use TokenConcatenation to make sure we don't allow stuff like // making two '<' adjacent. - return !(Lexer::isIdentifierBodyChar(left, LangOpts) && - Lexer::isIdentifierBodyChar(right, LangOpts)); + return !(Lexer::isAsciiIdentifierContinueChar(left, LangOpts) && + Lexer::isAsciiIdentifierContinueChar(right, LangOpts)); } /// Returns true if it is ok to eliminate the trailing whitespace between diff --git a/clang/lib/Frontend/LayoutOverrideSource.cpp b/clang/lib/Frontend/LayoutOverrideSource.cpp index 76762d5..c735c6c 100644 --- a/clang/lib/Frontend/LayoutOverrideSource.cpp +++ b/clang/lib/Frontend/LayoutOverrideSource.cpp @@ -16,11 +16,11 @@ using namespace clang; /// Parse a simple identifier. 
static std::string parseName(StringRef S) { - if (S.empty() || !isIdentifierHead(S[0])) + if (S.empty() || !isAsciiIdentifierStart(S[0])) return ""; unsigned Offset = 1; - while (Offset < S.size() && isIdentifierBody(S[Offset])) + while (Offset < S.size() && isAsciiIdentifierContinue(S[Offset])) ++Offset; return S.substr(0, Offset).str(); diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp index 09ed07b..6685109 100644 --- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp @@ -231,7 +231,7 @@ public: assert(OS && "loaded module file after finishing rewrite action?"); (*OS) << "#pragma clang module build "; - if (isValidIdentifier(MF->ModuleName)) + if (isValidAsciiIdentifier(MF->ModuleName)) (*OS) << MF->ModuleName; else { (*OS) << '"'; diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp index cfca167..f5cbd5e 100644 --- a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp +++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp @@ -131,17 +131,17 @@ LLVM_NODISCARD static bool isRawStringLiteral(const char *First, --Current; if (*Current != 'R') return false; - if (First == Current || !isIdentifierBody(*--Current)) + if (First == Current || !isAsciiIdentifierContinue(*--Current)) return true; // Check for a prefix of "u", "U", or "L". if (*Current == 'u' || *Current == 'U' || *Current == 'L') - return First == Current || !isIdentifierBody(*--Current); + return First == Current || !isAsciiIdentifierContinue(*--Current); // Check for a prefix of "u8". 
if (*Current != '8' || First == Current || *Current-- != 'u') return false; - return First == Current || !isIdentifierBody(*--Current); + return First == Current || !isAsciiIdentifierContinue(*--Current); } static void skipRawString(const char *&First, const char *const End) { @@ -319,7 +319,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start, if (!isPreprocessingNumberBody(Prev)) return false; // The next character should be a valid identifier body character. - return (Cur + 1) < End && isIdentifierBody(*(Cur + 1)); + return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1)); } static void skipLine(const char *&First, const char *const End) { @@ -484,7 +484,7 @@ void Minimizer::printAdjacentModuleNameParts(const char *&First, const char *Last = First; do ++Last; - while (Last != End && (isIdentifierBody(*Last) || *Last == '.')); + while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.')); append(First, Last); First = Last; } @@ -507,7 +507,7 @@ bool Minimizer::printAtImportBody(const char *&First, const char *const End) { } // Don't handle macro expansions inside @import for now. - if (!isIdentifierBody(*First) && *First != '.') + if (!isAsciiIdentifierContinue(*First) && *First != '.') return true; printAdjacentModuleNameParts(First, End); @@ -524,9 +524,9 @@ void Minimizer::printDirectiveBody(const char *&First, const char *const End) { LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, const char *const End) { - assert(isIdentifierBody(*First) && "invalid identifer"); + assert(isAsciiIdentifierContinue(*First) && "invalid identifer"); const char *Last = First + 1; - while (Last != End && isIdentifierBody(*Last)) + while (Last != End && isAsciiIdentifierContinue(*Last)) ++Last; return Last; } @@ -540,7 +540,7 @@ getIdentifierContinuation(const char *First, const char *const End) { skipNewline(First, End); if (First == End) return nullptr; - return isIdentifierBody(First[0]) ? 
First : nullptr; + return isAsciiIdentifierContinue(First[0]) ? First : nullptr; } Minimizer::IdInfo Minimizer::lexIdentifier(const char *First, @@ -569,7 +569,7 @@ void Minimizer::printAdjacentMacroArgs(const char *&First, do ++Last; while (Last != End && - (isIdentifierBody(*Last) || *Last == '.' || *Last == ',')); + (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ',')); append(First, Last); First = Last; } @@ -588,7 +588,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) { } // This is intentionally fairly liberal. - if (!(isIdentifierBody(*First) || *First == '.' || *First == ',')) + if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ',')) return true; printAdjacentMacroArgs(First, End); @@ -602,7 +602,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) { bool Minimizer::isNextIdentifier(StringRef Id, const char *&First, const char *const End) { skipWhitespace(First, End); - if (First == End || !isIdentifierHead(*First)) + if (First == End || !isAsciiIdentifierStart(*First)) return false; IdInfo FoundId = lexIdentifier(First, End); @@ -639,7 +639,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) { if (Id.Name == "export") { Export = true; skipWhitespace(First, End); - if (!isIdentifierBody(*First)) { + if (!isAsciiIdentifierContinue(*First)) { skipLine(First, End); return false; } @@ -663,7 +663,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) { case '"': break; default: - if (!isIdentifierBody(*First)) { + if (!isAsciiIdentifierContinue(*First)) { skipLine(First, End); return false; } @@ -690,7 +690,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) { append("#define "); skipWhitespace(First, End); - if (!isIdentifierHead(*First)) + if (!isAsciiIdentifierStart(*First)) return reportError(First, diag::err_pp_macro_not_identifier); IdInfo Id = lexIdentifier(First, End); @@ -722,7 +722,7 @@ bool 
Minimizer::lexDefine(const char *&First, const char *const End) { bool Minimizer::lexPragma(const char *&First, const char *const End) { // #pragma. skipWhitespace(First, End); - if (First == End || !isIdentifierHead(*First)) + if (First == End || !isAsciiIdentifierStart(*First)) return false; IdInfo FoundId = lexIdentifier(First, End); @@ -827,7 +827,7 @@ bool Minimizer::lexPPLine(const char *&First, const char *const End) { if (First == End) return reportError(First, diag::err_pp_expected_eol); - if (!isIdentifierHead(*First)) { + if (!isAsciiIdentifierStart(*First)) { skipLine(First, End); return false; } diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 05b84e0..2685924 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1062,8 +1062,8 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics( return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); } -bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { - return isIdentifierBody(c, LangOpts.DollarIdents); +bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { + return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); } bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { @@ -1712,103 +1712,128 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { return true; } -bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { - // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] - unsigned Size; - unsigned char C = *CurPtr++; - while (isIdentifierBody(C)) - C = *CurPtr++; - - --CurPtr; // Back up over the skipped character. - - // Fast path, no $,\,? in identifier found. '\' might be an escaped newline - // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. - // - // TODO: Could merge these checks into an InfoTable flag to make the - // comparison cheaper - if (isASCII(C) && C != '\\' && C != '?' 
&& - (C != '$' || !LangOpts.DollarIdents)) { -FinishIdentifier: - const char *IdStart = BufferPtr; - FormTokenWithChars(Result, CurPtr, tok::raw_identifier); - Result.setRawIdentifierData(IdStart); - - // If we are in raw mode, return this identifier raw. There is no need to - // look up identifier information or attempt to macro expand it. - if (LexingRawMode) - return true; - - // Fill in Result.IdentifierInfo and update the token kind, - // looking up the identifier in the identifier table. - IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); - // Note that we have to call PP->LookUpIdentifierInfo() even for code - // completion, it writes IdentifierInfo into Result, and callers rely on it. - - // If the completion point is at the end of an identifier, we want to treat - // the identifier as incomplete even if it resolves to a macro or a keyword. - // This allows e.g. 'class^' to complete to 'classifier'. - if (isCodeCompletionPoint(CurPtr)) { - // Return the code-completion token. - Result.setKind(tok::code_completion); - // Skip the code-completion char and all immediate identifier characters. - // This ensures we get consistent behavior when completing at any point in - // an identifier (i.e. at the start, in the middle, at the end). Note that - // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code - // simpler. - assert(*CurPtr == 0 && "Completion character must be 0"); - ++CurPtr; - // Note that code completion token is not added as a separate character - // when the completion point is at the end of the buffer. Therefore, we need - // to check if the buffer has ended. 
- if (CurPtr < BufferEnd) { - while (isIdentifierBody(*CurPtr)) - ++CurPtr; - } - BufferPtr = CurPtr; - return true; +bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, + const char *CurPtr) { + if (isAllowedInitiallyIDChar(C, LangOpts)) { + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput()) { + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr), + /*IsFirst=*/true); + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr)); } - // Finally, now that we know we have an identifier, pass this off to the - // preprocessor, which may macro expand it or something. - if (II->isHandleIdentifierCase()) - return PP->HandleIdentifier(Result); + MIOpt.ReadToken(); + return LexIdentifierContinue(Result, CurPtr); + } - return true; + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && + !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { + // Non-ASCII characters tend to creep into source code unintentionally. + // Instead of letting the parser complain about the unknown token, + // just drop the character. + // Note that we can /only/ do this when the non-ASCII character is actually + // spelled as Unicode, not written as a UCN. The standard requires that + // we not throw away any possible preprocessor tokens, but there's a + // loophole in the mapping of Unicode characters to basic character set + // characters that allows us to map these particular characters to, say, + // whitespace. + diagnoseInvalidUnicodeCodepointInIdentifier( + PP->getDiagnostics(), LangOpts, C, + makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); + BufferPtr = CurPtr; + return false; } - // Otherwise, $,\,? in identifier found. Enter slower path. + // Otherwise, we have an explicit UCN or a character that's unlikely to show + // up by accident. 
+ MIOpt.ReadToken(); + FormTokenWithChars(Result, CurPtr, tok::unknown); + return true; +} - C = getCharAndSize(CurPtr, Size); +bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { + // Match [_A-Za-z0-9]*, we have already matched an identifier start. while (true) { + unsigned char C = *CurPtr; + // Fast path. + if (isAsciiIdentifierContinue(C)) { + ++CurPtr; + continue; + } + + unsigned Size; + // Slow path: handle trigraph, unicode codepoints, UCNs. + C = getCharAndSize(CurPtr, Size); + if (isAsciiIdentifierContinue(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + continue; + } if (C == '$') { // If we hit a $ and they are not supported in identifiers, we are done. - if (!LangOpts.DollarIdents) goto FinishIdentifier; - + if (!LangOpts.DollarIdents) + break; // Otherwise, emit a diagnostic and continue. if (!isLexingRawMode()) Diag(CurPtr, diag::ext_dollar_in_identifier); CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); continue; - } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { - C = getCharAndSize(CurPtr, Size); + } + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) continue; - } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { - C = getCharAndSize(CurPtr, Size); + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) continue; - } else if (!isIdentifierBody(C)) { - goto FinishIdentifier; - } + // Neither an expected Unicode codepoint nor a UCN. + break; + } - // Otherwise, this character is good, consume it. - CurPtr = ConsumeChar(CurPtr, Size, Result); + const char *IdStart = BufferPtr; + FormTokenWithChars(Result, CurPtr, tok::raw_identifier); + Result.setRawIdentifierData(IdStart); - C = getCharAndSize(CurPtr, Size); - while (isIdentifierBody(C)) { - CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); + // If we are in raw mode, return this identifier raw. 
There is no need to + // look up identifier information or attempt to macro expand it. + if (LexingRawMode) + return true; + + // Fill in Result.IdentifierInfo and update the token kind, + // looking up the identifier in the identifier table. + IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); + // Note that we have to call PP->LookUpIdentifierInfo() even for code + // completion, it writes IdentifierInfo into Result, and callers rely on it. + + // If the completion point is at the end of an identifier, we want to treat + // the identifier as incomplete even if it resolves to a macro or a keyword. + // This allows e.g. 'class^' to complete to 'classifier'. + if (isCodeCompletionPoint(CurPtr)) { + // Return the code-completion token. + Result.setKind(tok::code_completion); + // Skip the code-completion char and all immediate identifier characters. + // This ensures we get consistent behavior when completing at any point in + // an identifier (i.e. at the start, in the middle, at the end). Note that + // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code + // simpler. + assert(*CurPtr == 0 && "Completion character must be 0"); + ++CurPtr; + // Note that code completion token is not added as a separate character + // when the completion point is at the end of the buffer. Therefore, we need + // to check if the buffer has ended. + if (CurPtr < BufferEnd) { + while (isAsciiIdentifierContinue(*CurPtr)) + ++CurPtr; } + BufferPtr = CurPtr; + return true; } + + // Finally, now that we know we have an identifier, pass this off to the + // preprocessor, which may macro expand it or something. + if (II->isHandleIdentifierCase()) + return PP->HandleIdentifier(Result); + + return true; } /// isHexaLiteral - Return true if Start points to a hex constant. 
@@ -1864,7 +1889,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) { unsigned NextSize; char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); - if (isIdentifierBody(Next)) { + if (isAsciiIdentifierContinue(Next)) { if (!isLexingRawMode()) Diag(CurPtr, getLangOpts().CPlusPlus ? diag::warn_cxx11_compat_digit_separator @@ -1899,7 +1924,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, char C = getCharAndSize(CurPtr, Size); bool Consumed = false; - if (!isIdentifierHead(C)) { + if (!isAsciiIdentifierStart(C)) { if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) Consumed = true; else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) @@ -1938,7 +1963,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, unsigned NextSize; char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, getLangOpts()); - if (!isIdentifierBody(Next)) { + if (!isAsciiIdentifierContinue(Next)) { // End of suffix. Check whether this is on the allowed list. 
const StringRef CompleteSuffix(Buffer, Chars); IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), @@ -1970,10 +1995,12 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, Result.setFlag(Token::HasUDSuffix); while (true) { C = getCharAndSize(CurPtr, Size); - if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } - else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} - else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} - else break; + if (isAsciiIdentifierContinue(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { + } else + break; } return CurPtr; @@ -3205,47 +3232,6 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, return false; } -bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { - if (isAllowedInitiallyIDChar(C, LangOpts)) { - if (!isLexingRawMode() && !ParsingPreprocessorDirective && - !PP->isPreprocessedOutput()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, - makeCharRange(*this, BufferPtr, CurPtr), - /*IsFirst=*/true); - maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, - makeCharRange(*this, BufferPtr, CurPtr)); - } - - MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); - } - - if (!isLexingRawMode() && !ParsingPreprocessorDirective && - !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && - !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { - // Non-ASCII characters tend to creep into source code unintentionally. - // Instead of letting the parser complain about the unknown token, - // just drop the character. - // Note that we can /only/ do this when the non-ASCII character is actually - // spelled as Unicode, not written as a UCN. 
The standard requires that - // we not throw away any possible preprocessor tokens, but there's a - // loophole in the mapping of Unicode characters to basic character set - // characters that allows us to map these particular characters to, say, - // whitespace. - diagnoseInvalidUnicodeCodepointInIdentifier( - PP->getDiagnostics(), LangOpts, C, - makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); - BufferPtr = CurPtr; - return false; - } - - // Otherwise, we have an explicit UCN or a character that's unlikely to show - // up by accident. - MIOpt.ReadToken(); - FormTokenWithChars(Result, CurPtr, tok::unknown); - return true; -} - void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { IsAtStartOfLine = Result.isAtStartOfLine(); HasLeadingSpace = Result.hasLeadingSpace(); @@ -3489,7 +3475,7 @@ LexNextToken: } // treat u like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3518,7 +3504,7 @@ LexNextToken: } // treat U like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'R': // Identifier or C++0x raw string literal // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3534,7 +3520,7 @@ LexNextToken: } // treat R like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3573,7 +3559,7 @@ LexNextToken: case '_': // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case '$': // $ in identifiers. 
if (LangOpts.DollarIdents) { @@ -3581,7 +3567,7 @@ LexNextToken: Diag(CurPtr-1, diag::ext_dollar_in_identifier); // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); } Kind = tok::unknown; @@ -3996,7 +3982,7 @@ LexNextToken: goto LexNextToken; } - return LexUnicode(Result, CodePoint, CurPtr); + return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); } } @@ -4028,7 +4014,7 @@ LexNextToken: // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; } - return LexUnicode(Result, CodePoint, CurPtr); + return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); } if (isLexingRawMode() || ParsingPreprocessorDirective || diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index f9af7c2..8475417 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -338,7 +338,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name, if (Name.empty()) return Name; - if (!isValidIdentifier(Name)) { + if (!isValidAsciiIdentifier(Name)) { // If we don't already have something with the form of an identifier, // create a buffer with the sanitized name. 
Buffer.clear(); @@ -346,7 +346,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name, Buffer.push_back('_'); Buffer.reserve(Buffer.size() + Name.size()); for (unsigned I = 0, N = Name.size(); I != N; ++I) { - if (isIdentifierBody(Name[I])) + if (isAsciiIdentifierContinue(Name[I])) Buffer.push_back(Name[I]); else Buffer.push_back('_'); diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp index edbeced..d1c3d25 100644 --- a/clang/lib/Sema/SemaAvailability.cpp +++ b/clang/lib/Sema/SemaAvailability.cpp @@ -268,7 +268,7 @@ tryParseObjCMethodName(StringRef Name, SmallVectorImpl<StringRef> &SlotNames, for (StringRef S : SlotNames) { if (S.empty()) continue; - if (!isValidIdentifier(S, AllowDollar)) + if (!isValidAsciiIdentifier(S, AllowDollar)) return None; } return NumParams; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index f93db4b..35c3204 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6105,7 +6105,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc, if (BaseName.empty()) { BaseName = ContextName; ContextName = StringRef(); - } else if (ContextName.empty() || !isValidIdentifier(ContextName)) { + } else if (ContextName.empty() || !isValidAsciiIdentifier(ContextName)) { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*context*/ 1; return false; @@ -6113,7 +6113,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc, IsMember = true; } - if (!isValidIdentifier(BaseName) || BaseName == "_") { + if (!isValidAsciiIdentifier(BaseName) || BaseName == "_") { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*basename*/ 0; return false; @@ -6163,7 +6163,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc, do { std::tie(CurrentParam, Parameters) = Parameters.split(':'); - if (!isValidIdentifier(CurrentParam)) { + if 
(!isValidAsciiIdentifier(CurrentParam)) { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*parameter*/2; return false; @@ -6332,13 +6332,13 @@ bool Sema::DiagnoseSwiftName(Decl *D, StringRef Name, SourceLocation Loc, if (BaseName.empty()) { BaseName = ContextName; ContextName = StringRef(); - } else if (!isValidIdentifier(ContextName)) { + } else if (!isValidAsciiIdentifier(ContextName)) { Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*context*/1; return false; } - if (!isValidIdentifier(BaseName)) { + if (!isValidAsciiIdentifier(BaseName)) { Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*basename*/0; return false; diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 102edb0..bdc8e1e 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -3772,7 +3772,7 @@ static void addFixitForObjCARCConversion( SourceManager &SM = S.getSourceManager(); char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts())) BridgeCall += ' '; BridgeCall += CFBridgeName; @@ -3790,7 +3790,7 @@ static void addFixitForObjCARCConversion( SourceManager &SM = S.getSourceManager(); char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts())) BridgeCall += ' '; BridgeCall += CFBridgeName; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 69c9de3..dcf18d3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -4260,8 +4260,8 @@ static void fixItNullability(Sema &S, DiagBuilderT &Diag, InsertionText = InsertionText.drop_back().drop_front(); else InsertionText = InsertionText.drop_front(); - } else if (!isIdentifierBody(NextChar[0], 
/*allow dollar*/true) && - !isIdentifierBody(NextChar[-1], /*allow dollar*/true)) { + } else if (!isAsciiIdentifierContinue(NextChar[0], /*allow dollar*/ true) && + !isAsciiIdentifierContinue(NextChar[-1], /*allow dollar*/ true)) { InsertionText = InsertionText.drop_back().drop_front(); } diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp index 66fa04a..242db2a 100644 --- a/clang/lib/Tooling/Transformer/Parsing.cpp +++ b/clang/lib/Tooling/Transformer/Parsing.cpp @@ -165,7 +165,7 @@ static ExpectedProgress<llvm::NoneType> parseChar(char c, ParseState State) { static ExpectedProgress<std::string> parseId(ParseState State) { State.Input = consumeWhitespace(State.Input); auto Id = State.Input.take_while( - [](char c) { return isASCII(c) && isIdentifierBody(c); }); + [](char c) { return isASCII(c) && isAsciiIdentifierContinue(c); }); if (Id.empty()) return makeParseError(State, "failed to parse name"); return makeParseProgress(advance(State, Id.size()), Id.str()); |