Cleanup identifier parsing; NFC

Rename methods to clearly signal when they only deal with ASCII, simplify the parsing of identifier, and use start/continue instead of head/body for consistency with Unicode terminology.
author: Corentin Jabot <corentin.jabot@gmail.com> 2021-09-14 09:11:23 -0400
committer: Aaron Ballman <aaron@aaronballman.com> 2021-09-14 09:12:22 -0400
commit: 601102d282d5e9a1429fea52ee17303aec8a7c10 (patch)
tree: 524c34d1746fc2a62524f8ec2650c7f912196810 /clang/lib/Lex
parent: 9bbc0c1ffb47f9cf4c9d8e9a0e8100002fe5aafb (diff)
download: llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.zip
llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.gz
llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.bz2
3 files changed, 139 insertions, 153 deletions
diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
index cfca167..f5cbd5e 100644
--- a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
+++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
@@ -131,17 +131,17 @@ LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
   --Current;
   if (*Current != 'R')
     return false;
-  if (First == Current || !isIdentifierBody(*--Current))
+  if (First == Current || !isAsciiIdentifierContinue(*--Current))
     return true;
 
   // Check for a prefix of "u", "U", or "L".
   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
-    return First == Current || !isIdentifierBody(*--Current);
+    return First == Current || !isAsciiIdentifierContinue(*--Current);
 
   // Check for a prefix of "u8".
   if (*Current != '8' || First == Current || *Current-- != 'u')
     return false;
-  return First == Current || !isIdentifierBody(*--Current);
+  return First == Current || !isAsciiIdentifierContinue(*--Current);
 }
 
 static void skipRawString(const char *&First, const char *const End) {
@@ -319,7 +319,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start,
   if (!isPreprocessingNumberBody(Prev))
     return false;
   // The next character should be a valid identifier body character.
-  return (Cur + 1) < End && isIdentifierBody(*(Cur + 1));
+  return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
 }
 
 static void skipLine(const char *&First, const char *const End) {
@@ -484,7 +484,7 @@ void Minimizer::printAdjacentModuleNameParts(const char *&First,
   const char *Last = First;
   do
     ++Last;
-  while (Last != End && (isIdentifierBody(*Last) || *Last == '.'));
+  while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.'));
   append(First, Last);
   First = Last;
 }
@@ -507,7 +507,7 @@ bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
     }
 
     // Don't handle macro expansions inside @import for now.
-    if (!isIdentifierBody(*First) && *First != '.')
+    if (!isAsciiIdentifierContinue(*First) && *First != '.')
       return true;
 
     printAdjacentModuleNameParts(First, End);
@@ -524,9 +524,9 @@ void Minimizer::printDirectiveBody(const char *&First, const char *const End) {
 
 LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
                                                    const char *const End) {
-  assert(isIdentifierBody(*First) && "invalid identifer");
+  assert(isAsciiIdentifierContinue(*First) && "invalid identifer");
   const char *Last = First + 1;
-  while (Last != End && isIdentifierBody(*Last))
+  while (Last != End && isAsciiIdentifierContinue(*Last))
     ++Last;
   return Last;
 }
@@ -540,7 +540,7 @@ getIdentifierContinuation(const char *First, const char *const End) {
   skipNewline(First, End);
   if (First == End)
     return nullptr;
-  return isIdentifierBody(First[0]) ? First : nullptr;
+  return isAsciiIdentifierContinue(First[0]) ? First : nullptr;
 }
 
 Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
@@ -569,7 +569,7 @@ void Minimizer::printAdjacentMacroArgs(const char *&First,
   do
     ++Last;
   while (Last != End &&
-         (isIdentifierBody(*Last) || *Last == '.' || *Last == ','));
+         (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ','));
   append(First, Last);
   First = Last;
 }
@@ -588,7 +588,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
     }
 
     // This is intentionally fairly liberal.
-    if (!(isIdentifierBody(*First) || *First == '.' || *First == ','))
+    if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ','))
       return true;
 
     printAdjacentMacroArgs(First, End);
@@ -602,7 +602,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
 bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
                                  const char *const End) {
   skipWhitespace(First, End);
-  if (First == End || !isIdentifierHead(*First))
+  if (First == End || !isAsciiIdentifierStart(*First))
     return false;
 
   IdInfo FoundId = lexIdentifier(First, End);
@@ -639,7 +639,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
   if (Id.Name == "export") {
     Export = true;
     skipWhitespace(First, End);
-    if (!isIdentifierBody(*First)) {
+    if (!isAsciiIdentifierContinue(*First)) {
       skipLine(First, End);
       return false;
     }
@@ -663,7 +663,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
   case '"':
     break;
   default:
-    if (!isIdentifierBody(*First)) {
+    if (!isAsciiIdentifierContinue(*First)) {
       skipLine(First, End);
       return false;
     }
@@ -690,7 +690,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
   append("#define ");
   skipWhitespace(First, End);
 
-  if (!isIdentifierHead(*First))
+  if (!isAsciiIdentifierStart(*First))
     return reportError(First, diag::err_pp_macro_not_identifier);
 
   IdInfo Id = lexIdentifier(First, End);
@@ -722,7 +722,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
 bool Minimizer::lexPragma(const char *&First, const char *const End) {
   // #pragma.
   skipWhitespace(First, End);
-  if (First == End || !isIdentifierHead(*First))
+  if (First == End || !isAsciiIdentifierStart(*First))
     return false;
 
   IdInfo FoundId = lexIdentifier(First, End);
@@ -827,7 +827,7 @@ bool Minimizer::lexPPLine(const char *&First, const char *const End) {
   if (First == End)
     return reportError(First, diag::err_pp_expected_eol);
 
-  if (!isIdentifierHead(*First)) {
+  if (!isAsciiIdentifierStart(*First)) {
     skipLine(First, End);
     return false;
   }
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 05b84e0..2685924 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1062,8 +1062,8 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics(
   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
 }
 
-bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
-  return isIdentifierBody(c, LangOpts.DollarIdents);
+bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
+  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
 }
 
 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
@@ -1712,103 +1712,128 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
   return true;
 }
 
-bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
-  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
-  unsigned Size;
-  unsigned char C = *CurPtr++;
-  while (isIdentifierBody(C))
-    C = *CurPtr++;
-
-  --CurPtr;   // Back up over the skipped character.
-
-  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
-  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
-  //
-  // TODO: Could merge these checks into an InfoTable flag to make the
-  // comparison cheaper
-  if (isASCII(C) && C != '\\' && C != '?' &&
-      (C != '$' || !LangOpts.DollarIdents)) {
-FinishIdentifier:
-    const char *IdStart = BufferPtr;
-    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
-    Result.setRawIdentifierData(IdStart);
-
-    // If we are in raw mode, return this identifier raw.  There is no need to
-    // look up identifier information or attempt to macro expand it.
-    if (LexingRawMode)
-      return true;
-
-    // Fill in Result.IdentifierInfo and update the token kind,
-    // looking up the identifier in the identifier table.
-    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
-    // Note that we have to call PP->LookUpIdentifierInfo() even for code
-    // completion, it writes IdentifierInfo into Result, and callers rely on it.
-
-    // If the completion point is at the end of an identifier, we want to treat
-    // the identifier as incomplete even if it resolves to a macro or a keyword.
-    // This allows e.g. 'class^' to complete to 'classifier'.
-    if (isCodeCompletionPoint(CurPtr)) {
-      // Return the code-completion token.
-      Result.setKind(tok::code_completion);
-      // Skip the code-completion char and all immediate identifier characters.
-      // This ensures we get consistent behavior when completing at any point in
-      // an identifier (i.e. at the start, in the middle, at the end). Note that
-      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
-      // simpler.
-      assert(*CurPtr == 0 && "Completion character must be 0");
-      ++CurPtr;
-      // Note that code completion token is not added as a separate character
-      // when the completion point is at the end of the buffer. Therefore, we need
-      // to check if the buffer has ended.
-      if (CurPtr < BufferEnd) {
-        while (isIdentifierBody(*CurPtr))
-          ++CurPtr;
-      }
-      BufferPtr = CurPtr;
-      return true;
+bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
+                                      const char *CurPtr) {
+  if (isAllowedInitiallyIDChar(C, LangOpts)) {
+    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
+        !PP->isPreprocessedOutput()) {
+      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
+                                makeCharRange(*this, BufferPtr, CurPtr),
+                                /*IsFirst=*/true);
+      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
+                                 makeCharRange(*this, BufferPtr, CurPtr));
     }
 
-    // Finally, now that we know we have an identifier, pass this off to the
-    // preprocessor, which may macro expand it or something.
-    if (II->isHandleIdentifierCase())
-      return PP->HandleIdentifier(Result);
+    MIOpt.ReadToken();
+    return LexIdentifierContinue(Result, CurPtr);
+  }
 
-    return true;
+  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
+      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
+      !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
+    // Non-ASCII characters tend to creep into source code unintentionally.
+    // Instead of letting the parser complain about the unknown token,
+    // just drop the character.
+    // Note that we can /only/ do this when the non-ASCII character is actually
+    // spelled as Unicode, not written as a UCN. The standard requires that
+    // we not throw away any possible preprocessor tokens, but there's a
+    // loophole in the mapping of Unicode characters to basic character set
+    // characters that allows us to map these particular characters to, say,
+    // whitespace.
+    diagnoseInvalidUnicodeCodepointInIdentifier(
+        PP->getDiagnostics(), LangOpts, C,
+        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
+    BufferPtr = CurPtr;
+    return false;
   }
 
-  // Otherwise, $,\,? in identifier found.  Enter slower path.
+  // Otherwise, we have an explicit UCN or a character that's unlikely to show
+  // up by accident.
+  MIOpt.ReadToken();
+  FormTokenWithChars(Result, CurPtr, tok::unknown);
+  return true;
+}
 
-  C = getCharAndSize(CurPtr, Size);
+bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
+  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
   while (true) {
+    unsigned char C = *CurPtr;
+    // Fast path.
+    if (isAsciiIdentifierContinue(C)) {
+      ++CurPtr;
+      continue;
+    }
+
+    unsigned Size;
+    // Slow path: handle trigraph, unicode codepoints, UCNs.
+    C = getCharAndSize(CurPtr, Size);
+    if (isAsciiIdentifierContinue(C)) {
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
+      continue;
+    }
     if (C == '$') {
       // If we hit a $ and they are not supported in identifiers, we are done.
-      if (!LangOpts.DollarIdents) goto FinishIdentifier;
-
+      if (!LangOpts.DollarIdents)
+        break;
       // Otherwise, emit a diagnostic and continue.
       if (!isLexingRawMode())
         Diag(CurPtr, diag::ext_dollar_in_identifier);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
-      C = getCharAndSize(CurPtr, Size);
       continue;
-    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
-      C = getCharAndSize(CurPtr, Size);
+    }
+    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
       continue;
-    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
-      C = getCharAndSize(CurPtr, Size);
+    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
       continue;
-    } else if (!isIdentifierBody(C)) {
-      goto FinishIdentifier;
-    }
+    // Neither an expected Unicode codepoint nor a UCN.
+    break;
+  }
 
-    // Otherwise, this character is good, consume it.
-    CurPtr = ConsumeChar(CurPtr, Size, Result);
+  const char *IdStart = BufferPtr;
+  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
+  Result.setRawIdentifierData(IdStart);
 
-    C = getCharAndSize(CurPtr, Size);
-    while (isIdentifierBody(C)) {
-      CurPtr = ConsumeChar(CurPtr, Size, Result);
-      C = getCharAndSize(CurPtr, Size);
+  // If we are in raw mode, return this identifier raw.  There is no need to
+  // look up identifier information or attempt to macro expand it.
+  if (LexingRawMode)
+    return true;
+
+  // Fill in Result.IdentifierInfo and update the token kind,
+  // looking up the identifier in the identifier table.
+  IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
+  // Note that we have to call PP->LookUpIdentifierInfo() even for code
+  // completion, it writes IdentifierInfo into Result, and callers rely on it.
+
+  // If the completion point is at the end of an identifier, we want to treat
+  // the identifier as incomplete even if it resolves to a macro or a keyword.
+  // This allows e.g. 'class^' to complete to 'classifier'.
+  if (isCodeCompletionPoint(CurPtr)) {
+    // Return the code-completion token.
+    Result.setKind(tok::code_completion);
+    // Skip the code-completion char and all immediate identifier characters.
+    // This ensures we get consistent behavior when completing at any point in
+    // an identifier (i.e. at the start, in the middle, at the end). Note that
+    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
+    // simpler.
+    assert(*CurPtr == 0 && "Completion character must be 0");
+    ++CurPtr;
+    // Note that code completion token is not added as a separate character
+    // when the completion point is at the end of the buffer. Therefore, we need
+    // to check if the buffer has ended.
+    if (CurPtr < BufferEnd) {
+      while (isAsciiIdentifierContinue(*CurPtr))
+        ++CurPtr;
     }
+    BufferPtr = CurPtr;
+    return true;
   }
+
+  // Finally, now that we know we have an identifier, pass this off to the
+  // preprocessor, which may macro expand it or something.
+  if (II->isHandleIdentifierCase())
+    return PP->HandleIdentifier(Result);
+
+  return true;
 }
 
 /// isHexaLiteral - Return true if Start points to a hex constant.
@@ -1864,7 +1889,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) {
     unsigned NextSize;
     char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
-    if (isIdentifierBody(Next)) {
+    if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
@@ -1899,7 +1924,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
   char C = getCharAndSize(CurPtr, Size);
   bool Consumed = false;
 
-  if (!isIdentifierHead(C)) {
+  if (!isAsciiIdentifierStart(C)) {
     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
       Consumed = true;
     else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
@@ -1938,7 +1963,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
         unsigned NextSize;
         char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                          getLangOpts());
-        if (!isIdentifierBody(Next)) {
+        if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
           IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
@@ -1970,10 +1995,12 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
   Result.setFlag(Token::HasUDSuffix);
   while (true) {
     C = getCharAndSize(CurPtr, Size);
-    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
-    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
-    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
-    else break;
+    if (isAsciiIdentifierContinue(C)) {
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
+    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
+    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
+    } else
+      break;
   }
 
   return CurPtr;
@@ -3205,47 +3232,6 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
   return false;
 }
 
-bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
-  if (isAllowedInitiallyIDChar(C, LangOpts)) {
-    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
-        !PP->isPreprocessedOutput()) {
-      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
-                                makeCharRange(*this, BufferPtr, CurPtr),
-                                /*IsFirst=*/true);
-      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
-                                 makeCharRange(*this, BufferPtr, CurPtr));
-    }
-
-    MIOpt.ReadToken();
-    return LexIdentifier(Result, CurPtr);
-  }
-
-  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
-      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
-      !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
-    // Non-ASCII characters tend to creep into source code unintentionally.
-    // Instead of letting the parser complain about the unknown token,
-    // just drop the character.
-    // Note that we can /only/ do this when the non-ASCII character is actually
-    // spelled as Unicode, not written as a UCN. The standard requires that
-    // we not throw away any possible preprocessor tokens, but there's a
-    // loophole in the mapping of Unicode characters to basic character set
-    // characters that allows us to map these particular characters to, say,
-    // whitespace.
-    diagnoseInvalidUnicodeCodepointInIdentifier(
-        PP->getDiagnostics(), LangOpts, C,
-        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
-    BufferPtr = CurPtr;
-    return false;
-  }
-
-  // Otherwise, we have an explicit UCN or a character that's unlikely to show
-  // up by accident.
-  MIOpt.ReadToken();
-  FormTokenWithChars(Result, CurPtr, tok::unknown);
-  return true;
-}
-
 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
   IsAtStartOfLine = Result.isAtStartOfLine();
   HasLeadingSpace = Result.hasLeadingSpace();
@@ -3489,7 +3475,7 @@ LexNextToken:
     }
 
     // treat u like the start of an identifier.
-    return LexIdentifier(Result, CurPtr);
+    return LexIdentifierContinue(Result, CurPtr);
 
   case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
     // Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3518,7 +3504,7 @@ LexNextToken:
     }
 
     // treat U like the start of an identifier.
-    return LexIdentifier(Result, CurPtr);
+    return LexIdentifierContinue(Result, CurPtr);
 
   case 'R': // Identifier or C++0x raw string literal
     // Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3534,7 +3520,7 @@ LexNextToken:
     }
 
     // treat R like the start of an identifier.
-    return LexIdentifier(Result, CurPtr);
+    return LexIdentifierContinue(Result, CurPtr);
 
   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
     // Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3573,7 +3559,7 @@ LexNextToken:
   case '_':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
-    return LexIdentifier(Result, CurPtr);
+    return LexIdentifierContinue(Result, CurPtr);
 
   case '$':   // $ in identifiers.
     if (LangOpts.DollarIdents) {
@@ -3581,7 +3567,7 @@ LexNextToken:
         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
       // Notify MIOpt that we read a non-whitespace/non-comment token.
       MIOpt.ReadToken();
-      return LexIdentifier(Result, CurPtr);
+      return LexIdentifierContinue(Result, CurPtr);
     }
 
     Kind = tok::unknown;
@@ -3996,7 +3982,7 @@ LexNextToken:
           goto LexNextToken;
         }
 
-        return LexUnicode(Result, CodePoint, CurPtr);
+        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
       }
     }
 
@@ -4028,7 +4014,7 @@ LexNextToken:
         // (We manually eliminate the tail call to avoid recursion.)
         goto LexNextToken;
       }
-      return LexUnicode(Result, CodePoint, CurPtr);
+      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
     }
 
     if (isLexingRawMode() || ParsingPreprocessorDirective ||
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index f9af7c2..8475417 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -338,7 +338,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name,
   if (Name.empty())
     return Name;
 
-  if (!isValidIdentifier(Name)) {
+  if (!isValidAsciiIdentifier(Name)) {
     // If we don't already have something with the form of an identifier,
     // create a buffer with the sanitized name.
     Buffer.clear();
@@ -346,7 +346,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name,
       Buffer.push_back('_');
     Buffer.reserve(Buffer.size() + Name.size());
     for (unsigned I = 0, N = Name.size(); I != N; ++I) {
-      if (isIdentifierBody(Name[I]))
+      if (isAsciiIdentifierContinue(Name[I]))
         Buffer.push_back(Name[I]);
       else
         Buffer.push_back('_');
author	Corentin Jabot <corentin.jabot@gmail.com>	2021-09-14 09:11:23 -0400
committer	Aaron Ballman <aaron@aaronballman.com>	2021-09-14 09:12:22 -0400
commit	601102d282d5e9a1429fea52ee17303aec8a7c10 (patch)
tree	524c34d1746fc2a62524f8ec2650c7f912196810 /clang/lib/Lex
parent	9bbc0c1ffb47f9cf4c9d8e9a0e8100002fe5aafb (diff)
download	llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.zip llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.gz llvm-601102d282d5e9a1429fea52ee17303aec8a7c10.tar.bz2