Reland [clang][Sema, Lex, Parse] Preprocessor embed in C and C++ (#95802)

This commit implements the entirety of the now-accepted [N3017 - Preprocessor Embed](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3017.htm) and its sister C++ paper [p1967](https://wg21.link/p1967). It implements everything in the specification, and includes an implementation that drastically improves the time it takes to embed data in specific scenarios (the initialization of character type arrays). The mechanisms used to do this are used under the "as-if" rule. In general, when the system cannot detect that it is initializing an array object in a variable declaration, it will either generate an EmbedExpr AST node, which will be expanded by AST consumers (CodeGen or constant expression evaluators), or expand the embed directive as a comma expression. This reverts commit https://github.com/llvm/llvm-project/commit/682d461d5a231cee54d65910e6341769419a67d7. --------- Co-authored-by: The Phantom Derpstorm <phdofthehouse@gmail.com> Co-authored-by: Aaron Ballman <aaron@aaronballman.com> Co-authored-by: cor3ntin <corentinjabot@gmail.com> Co-authored-by: H. Vetinari <h.vetinari@gmx.com>
author: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com> 2024-06-20 14:38:46 +0200
committer: GitHub <noreply@github.com> 2024-06-20 14:38:46 +0200
commit: 41c6e4379204ffc00948edd33d59ba5ebbceaba2 (patch)
tree: b8508b8f0e7f108d1f6759922f49f367bd24fb00 /clang/lib/Frontend/PrintPreprocessedOutput.cpp
parent: af82e63c28f67bf61a9b2b0e64bc55be4acf520e (diff)
download: llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.zip
llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.tar.gz
llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.tar.bz2
1 files changed, 115 insertions, 7 deletions
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index a26d2c3..0592423 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -11,11 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/PreprocessorOutputOptions.h"
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Pragma.h"
@@ -93,6 +93,7 @@ private:
   bool DisableLineMarkers;
   bool DumpDefines;
   bool DumpIncludeDirectives;
+  bool DumpEmbedDirectives;
   bool UseLineDirectives;
   bool IsFirstFileEntered;
   bool MinimizeWhitespace;
@@ -100,6 +101,7 @@ private:
   bool KeepSystemIncludes;
   raw_ostream *OrigOS;
   std::unique_ptr<llvm::raw_null_ostream> NullOS;
+  unsigned NumToksToSkip;
 
   Token PrevTok;
   Token PrevPrevTok;
@@ -107,14 +109,16 @@ private:
 public:
   PrintPPOutputPPCallbacks(Preprocessor &pp, raw_ostream *os, bool lineMarkers,
                            bool defines, bool DumpIncludeDirectives,
-                           bool UseLineDirectives, bool MinimizeWhitespace,
-                           bool DirectivesOnly, bool KeepSystemIncludes)
+                           bool DumpEmbedDirectives, bool UseLineDirectives,
+                           bool MinimizeWhitespace, bool DirectivesOnly,
+                           bool KeepSystemIncludes)
       : PP(pp), SM(PP.getSourceManager()), ConcatInfo(PP), OS(os),
         DisableLineMarkers(lineMarkers), DumpDefines(defines),
         DumpIncludeDirectives(DumpIncludeDirectives),
+        DumpEmbedDirectives(DumpEmbedDirectives),
         UseLineDirectives(UseLineDirectives),
         MinimizeWhitespace(MinimizeWhitespace), DirectivesOnly(DirectivesOnly),
-        KeepSystemIncludes(KeepSystemIncludes), OrigOS(os) {
+        KeepSystemIncludes(KeepSystemIncludes), OrigOS(os), NumToksToSkip(0) {
     CurLine = 0;
     CurFilename += "<uninit>";
     EmittedTokensOnThisLine = false;
@@ -129,6 +133,10 @@ public:
     PrevPrevTok.startToken();
   }
 
+  /// Returns true if #embed contents should be expanded into a comma-delimited
+  /// list of integer constants, false if the directive itself is printed.
+  bool expandEmbedContents() const { return !DumpEmbedDirectives; }
+
   bool isMinimizeWhitespace() const { return MinimizeWhitespace; }
 
   void setEmittedTokensOnThisLine() { EmittedTokensOnThisLine = true; }
@@ -149,6 +157,9 @@ public:
   void FileChanged(SourceLocation Loc, FileChangeReason Reason,
                    SrcMgr::CharacteristicKind FileType,
                    FileID PrevFID) override;
+  void EmbedDirective(SourceLocation HashLoc, StringRef FileName, bool IsAngled,
+                      OptionalFileEntryRef File,
+                      const LexEmbedParametersResult &Params) override;
   void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
                           StringRef FileName, bool IsAngled,
                           CharSourceRange FilenameRange,
@@ -232,6 +243,9 @@ public:
 
   void BeginModule(const Module *M);
   void EndModule(const Module *M);
+
+  unsigned GetNumToksToSkip() const { return NumToksToSkip; }
+  void ResetSkipToks() { NumToksToSkip = 0; }
 };
 }  // end anonymous namespace
 
@@ -399,6 +413,74 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc,
   }
 }
 
+void PrintPPOutputPPCallbacks::EmbedDirective(
+    SourceLocation HashLoc, StringRef FileName, bool IsAngled,
+    OptionalFileEntryRef File, const LexEmbedParametersResult &Params) {
+  if (!DumpEmbedDirectives)
+    return;
+
+  // Only reached when dumping directives: prints the #embed directive itself
+  // rather than its expansion. This callback runs before the annotation token
+  // stream for the directive is produced, so the annotation token, and the
+  // prefix, suffix, and if_empty tokens (which are inserted directly into the
+  // token stream), are counted here and skipped in PrintPreprocessedTokens().
+  // NOTE(review): prefix/suffix counts are added even when the file is empty;
+  // confirm the preprocessor still inserts those tokens in that case.
+  // FIXME: counting tokens to skip is a kludge but we have no way to know
+  // which tokens were inserted as part of the embed and which ones were
+  // explicitly written by the user.
+  MoveToLine(HashLoc, /*RequireStartOfLine=*/true);
+  *OS << "#embed " << (IsAngled ? '<' : '"') << FileName
+      << (IsAngled ? '>' : '"');
+
+  auto PrintToks = [&](llvm::ArrayRef<Token> Toks) {
+    SmallString<128> SpellingBuffer;
+    for (const Token &T : Toks) {
+      if (T.hasLeadingSpace())
+        *OS << " ";
+      *OS << PP.getSpelling(T, SpellingBuffer);
+    }
+  };
+  bool SkipAnnotToks = true;
+  if (Params.MaybeIfEmptyParam) {
+    *OS << " if_empty(";
+    PrintToks(Params.MaybeIfEmptyParam->Tokens);
+    *OS << ")";
+    // If the file is empty, skip the if_empty tokens and not the annotation
+    // token. Otherwise the annotation token is skipped (see below).
+    if (File && !File->getSize()) {
+      NumToksToSkip += Params.MaybeIfEmptyParam->Tokens.size();
+      SkipAnnotToks = false;
+    }
+  }
+
+  if (Params.MaybeLimitParam) {
+    *OS << " limit(" << Params.MaybeLimitParam->Limit << ")";
+  }
+  if (Params.MaybeOffsetParam) {
+    *OS << " clang::offset(" << Params.MaybeOffsetParam->Offset << ")";
+  }
+  if (Params.MaybePrefixParam) {
+    *OS << " prefix(";
+    PrintToks(Params.MaybePrefixParam->Tokens);
+    *OS << ")";
+    NumToksToSkip += Params.MaybePrefixParam->Tokens.size();
+  }
+  if (Params.MaybeSuffixParam) {
+    *OS << " suffix(";
+    PrintToks(Params.MaybeSuffixParam->Tokens);
+    *OS << ")";
+    NumToksToSkip += Params.MaybeSuffixParam->Tokens.size();
+  }
+
+  // Skip the embed annotation token unless the empty-file case applied above.
+  if (SkipAnnotToks)
+    NumToksToSkip++;
+
+  *OS << " /* clang -E -dE */";
+  setEmittedDirectiveOnThisLine();
+}
+
 void PrintPPOutputPPCallbacks::InclusionDirective(
     SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
     bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
@@ -678,7 +760,7 @@ void PrintPPOutputPPCallbacks::HandleWhitespaceBeforeTok(const Token &Tok,
   if (Tok.is(tok::eof) ||
       (Tok.isAnnotation() && !Tok.is(tok::annot_header_unit) &&
        !Tok.is(tok::annot_module_begin) && !Tok.is(tok::annot_module_end) &&
-       !Tok.is(tok::annot_repl_input_end)))
+       !Tok.is(tok::annot_repl_input_end) && !Tok.is(tok::annot_embed)))
     return;
 
   // EmittedDirectiveOnThisLine takes priority over RequireSameLine.
@@ -878,6 +960,27 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
       std::string Name = M->getFullModuleName();
       Callbacks->OS->write(Name.data(), Name.size());
       Callbacks->HandleNewlinesInToken(Name.data(), Name.size());
+    } else if (Tok.is(tok::annot_embed)) {
+      // Manually explode the binary data out to a stream of comma-delimited
+      // integer values. If the user passed -dE, that is handled by the
+      // EmbedDirective() callback. We should only get here if the user did not
+      // pass -dE.
+      assert(Callbacks->expandEmbedContents() &&
+             "did not expect an embed annotation");
+      auto *Data =
+          reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
+
+      // Loop over the contents and print them as a comma-delimited list of
+      // values.
+      bool PrintComma = false;
+      for (auto Iter = Data->BinaryData.begin(), End = Data->BinaryData.end();
+           Iter != End; ++Iter) {
+        if (PrintComma)
+          *Callbacks->OS << ", ";
+        *Callbacks->OS << static_cast<unsigned>(*Iter);
+        PrintComma = true;
+      }
+      IsStartOfLine = true;
     } else if (Tok.isAnnotation()) {
       // Ignore annotation tokens created by pragmas - the pragmas themselves
       // will be reproduced in the preprocessed output.
@@ -926,6 +1029,10 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
     if (Tok.is(tok::eof)) break;
 
     PP.Lex(Tok);
+    // If lexing that token causes us to need to skip future tokens, do so now.
+    for (unsigned I = 0, Skip = Callbacks->GetNumToksToSkip(); I < Skip; ++I)
+      PP.Lex(Tok);
+    Callbacks->ResetSkipToks();
   }
 }
 
@@ -982,8 +1089,9 @@ void clang::DoPrintPreprocessedInput(Preprocessor &PP, raw_ostream *OS,
 
   PrintPPOutputPPCallbacks *Callbacks = new PrintPPOutputPPCallbacks(
       PP, OS, !Opts.ShowLineMarkers, Opts.ShowMacros,
-      Opts.ShowIncludeDirectives, Opts.UseLineDirectives,
-      Opts.MinimizeWhitespace, Opts.DirectivesOnly, Opts.KeepSystemIncludes);
+      Opts.ShowIncludeDirectives, Opts.ShowEmbedDirectives,
+      Opts.UseLineDirectives, Opts.MinimizeWhitespace, Opts.DirectivesOnly,
+      Opts.KeepSystemIncludes);
 
   // Expand macros in pragmas with -fms-extensions.  The assumption is that
   // the majority of pragmas in such a file will be Microsoft pragmas.
author	Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>	2024-06-20 14:38:46 +0200
committer	GitHub <noreply@github.com>	2024-06-20 14:38:46 +0200
commit	41c6e4379204ffc00948edd33d59ba5ebbceaba2 (patch)
tree	b8508b8f0e7f108d1f6759922f49f367bd24fb00 /clang/lib/Frontend/PrintPreprocessedOutput.cpp
parent	af82e63c28f67bf61a9b2b0e64bc55be4acf520e (diff)
download	llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.zip llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.tar.gz llvm-41c6e4379204ffc00948edd33d59ba5ebbceaba2.tar.bz2