author    shawbyoung <shawbyoung@gmail.com>  2024-06-11 11:31:37 -0700
committer shawbyoung <shawbyoung@gmail.com>  2024-06-11 11:31:37 -0700
commit    2289286c92696109e2d63a977eb845ea4221f199 (patch)
tree      90e10f09ae908b875e0a3b4eccdd4c90d8d18989
parent    338cbfef03e0ab58d7b52f3301928c58b194a1b4 (diff)
parent    5a183532e960444088aa4d76d4686d7c93ba79b9 (diff)
Created using spr 1.3.4
 bolt/docs/CommandLineArgumentReference.md                   |  13
 bolt/include/bolt/Rewrite/DWARFRewriter.h                   |   4
 bolt/lib/Core/BinaryEmitter.cpp                             |   1
 bolt/lib/Profile/StaleProfileMatching.cpp                   |  52
 bolt/lib/Rewrite/DWARFRewriter.cpp                          |  61
 clang/include/clang/Driver/Options.td                       |   4
 clang/lib/Driver/ToolChains/Gnu.cpp                         |  29
 cross-project-tests/lit.cfg.py                              |  14
 cross-project-tests/lit.site.cfg.py.in                      |   4
 lldb/test/API/lit.cfg.py                                    |   5
 lldb/test/API/lit.site.cfg.py.in                            |   8
 lldb/test/Shell/helper/toolchain.py                         |   5
 lldb/test/Shell/lit.site.cfg.py.in                          |   9
 llvm/CMakeLists.txt                                         |   4
 llvm/include/llvm/MC/MCFragment.h                           |  22
 llvm/include/llvm/MC/MCObjectStreamer.h                     |   2
 llvm/include/llvm/MC/MCStreamer.h                           |   6
 llvm/include/llvm/Transforms/Utils/SampleProfileInference.h |   3
 llvm/lib/MC/MCAssembler.cpp                                 | 118
 llvm/lib/MC/MCExpr.cpp                                      |  10
 llvm/lib/MC/MCFragment.cpp                                  |  12
 llvm/lib/MC/MCObjectStreamer.cpp                            |   5
 llvm/lib/MC/MCStreamer.cpp                                  |   2
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp              |  24
 llvm/test/MC/X86/directive-avoid_end_align.s                | 208
 25 files changed, 540 insertions(+), 85 deletions(-)
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 8887d1f5..bdc1d9d 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -614,6 +614,17 @@
- `--lite-threshold-pct=<uint>`

Threshold (in percent) for selecting functions to process in lite mode. A higher
threshold means fewer functions to process. E.g., a threshold of 90 means only
the top 10 percent of functions with profile will be processed.
+
+- `--matched-profile-threshold=<uint>`
+
+ Threshold (in percent) of matched profile at which stale profile inference is
+ applied to functions. The argument corresponds to the sum of matched execution
+ counts of function blocks divided by the sum of execution counts of function
+ blocks. E.g., if the sum of a function's block execution counts is 100, the
+ sum of its matched block execution counts is 10, and the argument is 15
+ (15%), profile inference will not be applied to that function. A higher
+ threshold means fewer functions to process when the profile is stale.
+ Defaults to 5%.
@@ -1161,4 +1172,4 @@
- `--print-options`
- Print non-default options after command line parsing \ No newline at end of file
+ Print non-default options after command line parsing
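
As a reading aid, here is a minimal sketch (not part of the patch) of the
arithmetic behind this percent threshold. The names mirror MatchedExecCount and
ExecCount from this commit; the helper itself is hypothetical and is written in
cross-multiplied form so integer division cannot truncate either ratio to zero:

    #include <cstdint>

    // Illustration only: true when the matched fraction is below the
    // threshold, i.e. when stale-profile inference should be skipped.
    bool belowMatchedThreshold(uint64_t MatchedExecCount, uint64_t ExecCount,
                               unsigned ThresholdPct) {
      if (ExecCount == 0)
        return false;
      // Exact integer form of MatchedExecCount / ExecCount < ThresholdPct / 100.
      return MatchedExecCount * 100 <
             static_cast<uint64_t>(ThresholdPct) * ExecCount;
    }
    // E.g., belowMatchedThreshold(10, 100, 15) == true: inference is skipped,
    // matching the documentation example above.
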
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 8dec32de..3cc9d82 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -12,6 +12,7 @@
#include "bolt/Core/DIEBuilder.h"
#include "bolt/Core/DebugData.h"
#include "bolt/Core/DebugNames.h"
+#include "bolt/Core/GDBIndex.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/DWP/DWP.h"
@@ -131,7 +132,8 @@ private:
makeFinalLocListsSection(DWARFVersion Version);
/// Finalize type sections in the main binary.
- CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer);
+ CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
+ GDBIndex &GDBIndexSection);
/// Process and write out CUs that are passed in.
void finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 5793963..c231fff 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -487,6 +487,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
// This assumes the second instruction in the macro-op pair will get
// assigned to its own MCRelaxableFragment. Since all JCC instructions
// are relaxable, we should be safe.
+ Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI);
}
if (!EmitCodeOnly) {
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 365bc53..41afa6b 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -51,6 +51,12 @@ cl::opt<bool>
cl::desc("Infer counts from stale profile data."),
cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
+cl::opt<unsigned> MatchedProfileThreshold(
+ "matched-profile-threshold",
+ cl::desc("Percentage threshold of matched execution counts at which stale "
+ "profile inference is executed."),
+ cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));
+
cl::opt<unsigned> StaleMatchingMaxFuncSize(
"stale-matching-max-func-size",
cl::desc("The maximum size of a function to consider for inference."),
@@ -309,23 +315,29 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
FlowFunction Func;
// Add a special "dummy" source so that there is always a unique entry point.
- // Because of the extra source, for all other blocks in FlowFunction it holds
- // that Block.Index == BB->getIndex() + 1
FlowBlock EntryBlock;
EntryBlock.Index = 0;
Func.Blocks.push_back(EntryBlock);
- // Create FlowBlock for every basic block in the binary function
+ // Create FlowBlock for every basic block in the binary function.
for (const BinaryBasicBlock *BB : BlockOrder) {
Func.Blocks.emplace_back();
FlowBlock &Block = Func.Blocks.back();
Block.Index = Func.Blocks.size() - 1;
+ if (BB->successors().empty())
+ Block.IsExit = true;
(void)BB;
assert(Block.Index == BB->getIndex() + 1 &&
"incorrectly assigned basic block index");
}
- // Create FlowJump for each jump between basic blocks in the binary function
+ // Add a special "dummy" sink block so there is always a unique sink.
+ FlowBlock SinkBlock;
+ SinkBlock.Index = Func.Blocks.size();
+ Func.Blocks.push_back(SinkBlock);
+ Func.Sink = SinkBlock.Index;
+
+ // Create FlowJump for each jump between basic blocks in the binary function.
std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
for (const BinaryBasicBlock *SrcBB : BlockOrder) {
std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
@@ -358,9 +370,9 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
}
// Add dummy edges to the extra sources. If there are multiple entry blocks,
- // add an unlikely edge from 0 to the subsequent ones
+ // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
- for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
+ for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
const BinaryBasicBlock *BB = BlockOrder[I - 1];
if (BB->isEntryPoint() || InDegree[I] == 0) {
Func.Jumps.emplace_back();
@@ -372,6 +384,17 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
}
}
+ // Add dummy edges from the exit blocks to the sink block.
+ for (uint64_t I = 1; I < BlockOrder.size() + 1; I++) {
+ if (!Func.Blocks[I].IsExit)
+ continue;
+
+ Func.Jumps.emplace_back();
+ FlowJump &Jump = Func.Jumps.back();
+ Jump.Source = I;
+ Jump.Target = Func.Sink;
+ }
+
// Create necessary metadata for the flow function
for (FlowJump &Jump : Func.Jumps) {
assert(Jump.Source < Func.Blocks.size());
@@ -395,7 +418,7 @@ void matchWeightsByHashes(BinaryContext &BC,
const BinaryFunction::BasicBlockOrderType &BlockOrder,
const yaml::bolt::BinaryFunctionProfile &YamlBF,
FlowFunction &Func) {
- assert(Func.Blocks.size() == BlockOrder.size() + 1);
+ assert(Func.Blocks.size() == BlockOrder.size() + 2);
std::vector<FlowBlock *> Blocks;
std::vector<BlendedBlockHash> BlendedHashes;
@@ -434,6 +457,7 @@ void matchWeightsByHashes(BinaryContext &BC,
if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
++BC.Stats.NumMatchedBlocks;
BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
+ Func.MatchedExecCount += YamlBB.ExecCount;
LLVM_DEBUG(dbgs() << " exact match\n");
} else {
LLVM_DEBUG(dbgs() << " loose match\n");
@@ -575,10 +599,15 @@ void preprocessUnreachableBlocks(FlowFunction &Func) {
/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
-bool canApplyInference(const FlowFunction &Func) {
+bool canApplyInference(const FlowFunction &Func,
+ const yaml::bolt::BinaryFunctionProfile &YamlBF) {
if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
return false;
+ if (Func.MatchedExecCount * 100 <
+ opts::MatchedProfileThreshold * YamlBF.ExecCount)
+ return false;
+
bool HasExitBlocks = llvm::any_of(
Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
if (!HasExitBlocks)
@@ -618,7 +647,7 @@ void assignProfile(BinaryFunction &BF,
FlowFunction &Func) {
BinaryContext &BC = BF.getBinaryContext();
- assert(Func.Blocks.size() == BlockOrder.size() + 1);
+ assert(Func.Blocks.size() == BlockOrder.size() + 2);
for (uint64_t I = 0; I < BlockOrder.size(); I++) {
FlowBlock &Block = Func.Blocks[I + 1];
BinaryBasicBlock *BB = BlockOrder[I];
@@ -640,6 +669,9 @@ void assignProfile(BinaryFunction &BF,
if (Jump->Flow == 0)
continue;
+ // Skip jumps into the artificial sink block.
+ if (Jump->Target == Func.Sink)
+ continue;
BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
// Check if the edge corresponds to a regular jump or a landing pad
if (BB->getSuccessor(SuccBB.getLabel())) {
@@ -736,7 +768,7 @@ bool YAMLProfileReader::inferStaleProfile(
preprocessUnreachableBlocks(Func);
// Check if profile inference can be applied for the instance.
- if (!canApplyInference(Func))
+ if (!canApplyInference(Func, YamlBF))
return false;
// Apply the profile inference algorithm.
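
For orientation, a stand-alone sketch of the source/sink augmentation that
createFlowFunction now performs. Block, Jump, and Flow are simplified stand-ins
for FlowBlock, FlowJump, and FlowFunction, assuming only the fields this patch
touches; the dummy entry edges are omitted for brevity:

    #include <cstdint>
    #include <vector>

    struct Block { uint64_t Index = 0; bool IsExit = false; };
    struct Jump  { uint64_t Source = 0, Target = 0; };
    struct Flow  {
      std::vector<Block> Blocks;
      std::vector<Jump> Jumps;
      uint64_t Sink = 0;
    };

    // Given N real basic blocks: index 0 is the dummy entry, 1..N the real
    // blocks, N+1 the dummy sink; every exit block gets an edge to the sink.
    Flow augment(const std::vector<bool> &IsExitBlock) {
      Flow F;
      F.Blocks.push_back({0, false});                      // dummy entry
      for (uint64_t I = 0; I < IsExitBlock.size(); ++I)
        F.Blocks.push_back({I + 1, static_cast<bool>(IsExitBlock[I])});
      F.Sink = F.Blocks.size();
      F.Blocks.push_back({F.Sink, false});                 // dummy sink
      for (uint64_t I = 1; I <= IsExitBlock.size(); ++I)
        if (F.Blocks[I].IsExit)
          F.Jumps.push_back({I, F.Sink});                  // exit -> sink
      return F;
    }
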
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 8814ebb..7b62999 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -185,6 +185,7 @@ namespace bolt {
class DIEStreamer : public DwarfStreamer {
DIEBuilder *DIEBldr;
DWARFRewriter &Rewriter;
+ GDBIndex &GDBIndexSection;
private:
/// Emit the compilation unit header for \p Unit in the debug_info
@@ -247,7 +248,7 @@ private:
const uint64_t TypeSignature = cast<DWARFTypeUnit>(Unit).getTypeHash();
DIE *TypeDIE = DIEBldr->getTypeDIE(Unit);
const DIEBuilder::DWARFUnitInfo &UI = DIEBldr->getUnitInfoByDwarfUnit(Unit);
- Rewriter.addGDBTypeUnitEntry(
+ GDBIndexSection.addGDBTypeUnitEntry(
{UI.UnitOffset, TypeSignature, TypeDIE->getOffset()});
if (Unit.getVersion() < 5) {
// Switch the section to .debug_types section.
@@ -279,11 +280,12 @@ private:
public:
DIEStreamer(DIEBuilder *DIEBldr, DWARFRewriter &Rewriter,
+ GDBIndex &GDBIndexSection,
DWARFLinkerBase::OutputFileType OutFileType,
raw_pwrite_stream &OutFile,
DWARFLinkerBase::MessageHandlerTy Warning)
: DwarfStreamer(OutFileType, OutFile, Warning), DIEBldr(DIEBldr),
- Rewriter(Rewriter){};
+ Rewriter(Rewriter), GDBIndexSection(GDBIndexSection) {};
using DwarfStreamer::emitCompileUnitHeader;
@@ -326,12 +328,11 @@ static cl::opt<bool> KeepARanges(
"keep or generate .debug_aranges section if .gdb_index is written"),
cl::Hidden, cl::cat(BoltCategory));
-static cl::opt<bool>
-DeterministicDebugInfo("deterministic-debuginfo",
- cl::desc("disables parallel execution of tasks that may produce "
- "nondeterministic debug info"),
- cl::init(true),
- cl::cat(BoltCategory));
+static cl::opt<bool> DeterministicDebugInfo(
+ "deterministic-debuginfo",
+ cl::desc("disables parallel execution of tasks that may produce "
+ "nondeterministic debug info"),
+ cl::init(true), cl::cat(BoltCategory));
static cl::opt<std::string> DwarfOutputPath(
"dwarf-output-path",
@@ -460,10 +461,11 @@ static std::optional<uint64_t> getAsAddress(const DWARFUnit &DU,
static std::unique_ptr<DIEStreamer>
createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
StringRef Swift5ReflectionSegmentName, DIEBuilder &DIEBldr,
- DWARFRewriter &Rewriter) {
+ DWARFRewriter &Rewriter, GDBIndex &GDBIndexSection) {
std::unique_ptr<DIEStreamer> Streamer = std::make_unique<DIEStreamer>(
- &DIEBldr, Rewriter, DWARFLinkerBase::OutputFileType::Object, OutFile,
+ &DIEBldr, Rewriter, GDBIndexSection,
+ DWARFLinkerBase::OutputFileType::Object, OutFile,
[&](const Twine &Warning, StringRef Context, const DWARFDie *) {});
Error Err = Streamer->init(TheTriple, Swift5ReflectionSegmentName);
if (Err)
@@ -484,13 +486,12 @@ emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer, DWARFUnit &Unit) {
return {U.UnitOffset, U.UnitLength, TypeHash};
}
-static void emitDWOBuilder(const std::string &DWOName,
- DIEBuilder &DWODIEBuilder, DWARFRewriter &Rewriter,
- DWARFUnit &SplitCU, DWARFUnit &CU,
- DWARFRewriter::DWPState &State,
- DebugLocWriter &LocWriter,
- DebugStrOffsetsWriter &StrOffstsWriter,
- DebugStrWriter &StrWriter) {
+static void
+emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
+ DWARFRewriter &Rewriter, DWARFUnit &SplitCU, DWARFUnit &CU,
+ DWARFRewriter::DWPState &State, DebugLocWriter &LocWriter,
+ DebugStrOffsetsWriter &StrOffstsWriter,
+ DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection) {
// Populate debug_info and debug_abbrev for current dwo into StringRef.
DWODIEBuilder.generateAbbrevs();
DWODIEBuilder.finish();
@@ -500,8 +501,9 @@ static void emitDWOBuilder(const std::string &DWOName,
std::make_shared<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = SplitCU.getContext().getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
- *TheTriple, *ObjOS, "DwoStreamerInitAug2", DWODIEBuilder, Rewriter);
+ std::unique_ptr<DIEStreamer> Streamer =
+ createDIEStreamer(*TheTriple, *ObjOS, "DwoStreamerInitAug2",
+ DWODIEBuilder, Rewriter, GDBIndexSection);
DWARFRewriter::UnitMetaVectorType TUMetaVector;
DWARFRewriter::UnitMeta CUMI = {0, 0, 0};
if (SplitCU.getContext().getMaxDWOVersion() >= 5) {
@@ -652,6 +654,7 @@ void DWARFRewriter::updateDebugInfo() {
DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
*StrWriter);
+ GDBIndex GDBIndexSection(BC);
DWPState State;
if (opts::WriteDWP)
initDWPState(State);
@@ -704,7 +707,8 @@ void DWARFRewriter::updateDebugInfo() {
TempRangesSectionWriter->finalizeSection();
emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
- DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter);
+ DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+ GDBIndexSection);
}
if (Unit->getVersion() >= 5) {
@@ -729,9 +733,10 @@ void DWARFRewriter::updateDebugInfo() {
std::make_unique<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> Streamer =
- createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
- CUOffsetMap OffsetMap = finalizeTypeSections(DIEBlder, *Streamer);
+ std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
+ *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
+ CUOffsetMap OffsetMap =
+ finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection);
const bool SingleThreadedMode =
opts::NoThreads || opts::DeterministicDebugInfo;
@@ -761,7 +766,8 @@ void DWARFRewriter::updateDebugInfo() {
finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS,
OffsetMap);
- updateGdbIndexSection(OffsetMap, CUIndex);
+ GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex,
+ *ARangesSectionWriter);
}
void DWARFRewriter::updateUnitDebugInfo(
@@ -1429,7 +1435,8 @@ void DWARFRewriter::updateLineTableOffsets(const MCAsmLayout &Layout) {
}
CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
- DIEStreamer &Streamer) {
+ DIEStreamer &Streamer,
+ GDBIndex &GDBIndexSection) {
// update TypeUnit DW_AT_stmt_list with new .debug_line information.
auto updateLineTable = [&](const DWARFUnit &Unit) -> void {
DIE *UnitDIE = DIEBlder.getUnitDIEbyUnit(Unit);
@@ -1449,8 +1456,8 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
std::make_shared<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> TypeStreamer =
- createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
+ std::unique_ptr<DIEStreamer> TypeStreamer = createDIEStreamer(
+ *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
// generate debug_info and CUMap
CUOffsetMap CUMap;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa5..63bb867 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5483,6 +5483,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">,
MarshallingInfoFlag<CodeGenOpts<"InstrumentForProfiling">>;
def pipe : Flag<["-", "--"], "pipe">,
HelpText<"Use pipes between commands, when possible">;
+// Facebook T92898286
+def post_link_optimize : Flag<["--"], "post-link-optimize">,
+ HelpText<"Apply post-link optimizations using BOLT">;
+// End Facebook T92898286
def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
def prebind : Flag<["-"], "prebind">;
def preload : Flag<["-"], "preload">;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index b141e5f..f7611af 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -672,12 +672,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
}
}
+ // Facebook T92898286
+ if (Args.hasArg(options::OPT_post_link_optimize))
+ CmdArgs.push_back("-q");
+ // End Facebook T92898286
+
Args.AddAllArgs(CmdArgs, options::OPT_T);
const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
C.addCommand(std::make_unique<Command>(JA, *this,
ResponseFileSupport::AtFileCurCP(),
Exec, CmdArgs, Inputs, Output));
+ // Facebook T92898286
+ if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename())
+ return;
+
+ const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv"));
+ ArgStringList MoveCmdArgs;
+ MoveCmdArgs.push_back(Output.getFilename());
+ const char *PreBoltBin =
+ Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt");
+ MoveCmdArgs.push_back(PreBoltBin);
+ C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+ MvExec, MoveCmdArgs, std::nullopt));
+
+ ArgStringList BoltCmdArgs;
+ const char *BoltExec =
+ Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt"));
+ BoltCmdArgs.push_back(PreBoltBin);
+ BoltCmdArgs.push_back("-reorder-blocks=reverse");
+ BoltCmdArgs.push_back("-update-debug-sections");
+ BoltCmdArgs.push_back("-o");
+ BoltCmdArgs.push_back(Output.getFilename());
+ C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+ BoltExec, BoltCmdArgs, std::nullopt));
+ // End Facebook T92898286
}
void tools::gnutools::Assembler::ConstructJob(Compilation &C,
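
In effect, with --post-link-optimize the driver passes -q (--emit-relocs) to
the linker so relocations survive into the output, renames the linked binary to
<output>.pre-bolt via mv, and then runs
llvm-bolt <output>.pre-bolt -reorder-blocks=reverse -update-debug-sections -o <output>
to produce the final, post-link-optimized binary.
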
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 774c4ea..6196345 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -84,7 +84,13 @@ if is_msvc:
# use_clang() and use_lld() respectively, so set them to "", if needed.
if not hasattr(config, "clang_src_dir"):
config.clang_src_dir = ""
-llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# Facebook T92898286
+should_test_bolt = get_required_attr(config, "llvm_test_bolt")
+if should_test_bolt:
+ llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"])
+else:
+ llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# End Facebook T92898286
if not hasattr(config, "lld_src_dir"):
config.lld_src_dir = ""
@@ -293,3 +299,9 @@ llvm_config.feature_config([("--build-mode", {"Debug|RelWithDebInfo": "debug-inf
# Allow 'REQUIRES: XXX-registered-target' in tests.
for arch in config.targets_to_build:
config.available_features.add(arch.lower() + "-registered-target")
+
+# Facebook T92898286
+# Ensure the user's PYTHONPATH is included.
+if "PYTHONPATH" in os.environ:
+ config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"]
+# End Facebook T92898286
diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in
index 39458df..2d53cd3 100644
--- a/cross-project-tests/lit.site.cfg.py.in
+++ b/cross-project-tests/lit.site.cfg.py.in
@@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@"
config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index d934349..d4a62c5 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -248,6 +248,11 @@ if is_configured("lldb_libs_dir"):
if is_configured("lldb_framework_dir"):
dotest_cmd += ["--framework", config.lldb_framework_dir]
+# Facebook T92898286
+if is_configured("llvm_test_bolt"):
+ dotest_cmd += ["-E", '"--post-link-optimize"']
+# End Facebook T92898286
+
if (
"lldb-repro-capture" in config.available_features
or "lldb-repro-replay" in config.available_features
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 8b2d09a..602f457 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -1,5 +1,9 @@
@LIT_SITE_CFG_IN_HEADER@
+# Facebook T92898286
+import lit.util
+# End Facebook T92898286
+
config.llvm_src_root = "@LLVM_SOURCE_DIR@"
config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@"
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api")
config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api")
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
# Plugins
lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@'
if lldb_build_intel_pt == '1':
diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py
index 255955f..7b7be06 100644
--- a/lldb/test/Shell/helper/toolchain.py
+++ b/lldb/test/Shell/helper/toolchain.py
@@ -165,6 +165,11 @@ def use_support_substitutions(config):
if config.cmake_sysroot:
host_flags += ["--sysroot={}".format(config.cmake_sysroot)]
+ # Facebook T92898286
+ if config.llvm_test_bolt:
+ host_flags += ["--post-link-optimize"]
+ # End Facebook T92898286
+
host_flags = " ".join(host_flags)
config.substitutions.append(("%clang_host", "%clang " + host_flags))
config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags))
diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in
index b69e7bc..fe83237 100644
--- a/lldb/test/Shell/lit.site.cfg.py.in
+++ b/lldb/test/Shell/lit.site.cfg.py.in
@@ -1,5 +1,10 @@
@LIT_SITE_CFG_IN_HEADER@
+# Facebook T92898286
+import lit.util
+# End Facebook T92898286
+
+
config.llvm_src_root = "@LLVM_SOURCE_DIR@"
config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -31,6 +36,10 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell")
config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell")
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 3208147..ecd36d2 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -709,6 +709,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH
option(LLVM_USE_SPLIT_DWARF
"Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF)
+# Facebook T92898286
+option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF)
+# End Facebook T92898286
+
# Define an option controlling whether we should build for 32-bit on 64-bit
# platforms, where supported.
if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX"))
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index a9b19dc..256d984 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -33,6 +33,7 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
public:
enum FragmentType : uint8_t {
FT_Align,
+ FT_NeverAlign,
FT_Data,
FT_CompactEncodedInst,
FT_Fill,
@@ -344,6 +345,27 @@ public:
}
};
+class MCNeverAlignFragment : public MCFragment {
+ /// The alignment the end of the next fragment should avoid.
+ unsigned Alignment;
+
+ /// When emitting nops, some subtargets require specific nop encodings.
+ const MCSubtargetInfo &STI;
+
+public:
+ MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI,
+ MCSection *Sec = nullptr)
+ : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {}
+
+ unsigned getAlignment() const { return Alignment; }
+
+ const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
+
+ static bool classof(const MCFragment *F) {
+ return F->getKind() == MCFragment::FT_NeverAlign;
+ }
+};
+
class MCFillFragment : public MCFragment {
uint8_t ValueSize;
/// Value to use for filling bytes.
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index e212d54..c7d7607 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -157,6 +157,8 @@ public:
unsigned MaxBytesToEmit = 0) override;
void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0) override;
+ void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+ const MCSubtargetInfo &STI) override;
void emitValueToOffset(const MCExpr *Offset, unsigned char Value,
SMLoc Loc) override;
void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index b7468cf..dd81319 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -887,6 +887,12 @@ public:
virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0);
+ /// If the end of the fragment following this NeverAlign fragment ever gets
+ /// aligned to \p ByteAlignment, this fragment emits a single nop before the
+ /// following fragment to break this end-alignment.
+ virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+ const MCSubtargetInfo &STI);
+
/// Emit some number of copies of \p Value until the byte offset \p
/// Offset is reached.
///
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h
index b4ea1ad..c654715 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h
@@ -28,6 +28,7 @@ struct FlowBlock {
uint64_t Weight{0};
bool HasUnknownWeight{true};
bool IsUnlikely{false};
+ bool IsExit{false};
uint64_t Flow{0};
std::vector<FlowJump *> SuccJumps;
std::vector<FlowJump *> PredJumps;
@@ -57,6 +58,8 @@ struct FlowFunction {
std::vector<FlowJump> Jumps;
/// The index of the entry block.
uint64_t Entry{0};
+ /// Matched execution count for the function.
+ uint64_t MatchedExecCount{0};
};
/// Various thresholds and options controlling the behavior of the profile
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index ad30b5c..62baeb9 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -298,6 +298,43 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup,
return IsResolved;
}
+/// Check if the branch crosses the boundary.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch crosses the boundary.
+static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size,
+ Align BoundaryAlignment) {
+ uint64_t EndAddr = StartAddr + Size;
+ return (StartAddr >> Log2(BoundaryAlignment)) !=
+ ((EndAddr - 1) >> Log2(BoundaryAlignment));
+}
+
+/// Check if the branch is against the boundary.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch is against the boundary.
+static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size,
+ Align BoundaryAlignment) {
+ uint64_t EndAddr = StartAddr + Size;
+ return (EndAddr & (BoundaryAlignment.value() - 1)) == 0;
+}
+
+/// Check if the branch needs padding.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch needs padding.
+static bool needPadding(uint64_t StartAddr, uint64_t Size,
+ Align BoundaryAlignment) {
+ return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) ||
+ isAgainstBoundary(StartAddr, Size, BoundaryAlignment);
+}
+
uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
const MCFragment &F) const {
assert(getBackendPtr() && "Requires assembler backend");
@@ -358,6 +395,41 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
return Size;
}
+ case MCFragment::FT_NeverAlign: {
+ // Disclaimer: NeverAlign fragment size depends on the size of its immediate
+ // successor, but NeverAlign need not be an MCRelaxableFragment.
+ // NeverAlign fragment size is recomputed if the successor is relaxed:
+ // - If RelaxableFragment is relaxed, it gets invalidated by marking its
+ // predecessor as LastValidFragment.
+ // - This forces the assembler to call MCAsmLayout::layoutFragment on that
+ // relaxable fragment, which in turn will always ask the predecessor to
+ // compute its size (see "computeFragmentSize(prev)" in layoutFragment).
+ //
+ // In short, the simplest way to ensure that computeFragmentSize() is sane
+ // is to establish the following rule: it should never examine fragments
+ // after the current fragment in the section. If we logically need to
+ // examine any fragment after the current fragment, we need to do that using
+ // relaxation, inside MCAssembler::layoutSectionOnce.
+ const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+ const MCFragment *NF = F.getNextNode();
+ uint64_t Offset = Layout.getFragmentOffset(&NAF);
+ size_t NextFragSize = 0;
+ if (const auto *NextFrag = dyn_cast<MCRelaxableFragment>(NF)) {
+ NextFragSize = NextFrag->getContents().size();
+ } else if (const auto *NextFrag = dyn_cast<MCDataFragment>(NF)) {
+ NextFragSize = NextFrag->getContents().size();
+ } else {
+ llvm_unreachable("Didn't find the expected fragment after NeverAlign");
+ }
+ // Check if the next fragment ends at the alignment we want to avoid.
+ if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) {
+ // Avoid this alignment by introducing minimum nop.
+ assert(getBackend().getMinimumNopSize() != NAF.getAlignment());
+ return getBackend().getMinimumNopSize();
+ }
+ return 0;
+ }
+
case MCFragment::FT_Org: {
const MCOrgFragment &OF = cast<MCOrgFragment>(F);
MCValue Value;
@@ -581,6 +653,15 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
break;
}
+ case MCFragment::FT_NeverAlign: {
+ const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+ if (!Asm.getBackend().writeNopData(OS, FragmentSize,
+ &NAF.getSubtargetInfo()))
+ report_fatal_error("unable to write nop sequence of " +
+ Twine(FragmentSize) + " bytes");
+ break;
+ }
+
case MCFragment::FT_Data:
++stats::EmittedDataFragments;
OS << cast<MCDataFragment>(F).getContents();
@@ -1052,43 +1133,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
return OldSize != LF.getContents().size();
}
-/// Check if the branch crosses the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch cross the boundary.
-static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size,
- Align BoundaryAlignment) {
- uint64_t EndAddr = StartAddr + Size;
- return (StartAddr >> Log2(BoundaryAlignment)) !=
- ((EndAddr - 1) >> Log2(BoundaryAlignment));
-}
-
-/// Check if the branch is against the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch is against the boundary.
-static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size,
- Align BoundaryAlignment) {
- uint64_t EndAddr = StartAddr + Size;
- return (EndAddr & (BoundaryAlignment.value() - 1)) == 0;
-}
-
-/// Check if the branch needs padding.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch needs padding.
-static bool needPadding(uint64_t StartAddr, uint64_t Size,
- Align BoundaryAlignment) {
- return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) ||
- isAgainstBoundary(StartAddr, Size, BoundaryAlignment);
-}
-
bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
MCBoundaryAlignFragment &BF) {
// BoundaryAlignFragment that doesn't need to align any fragment should not be
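
To see the boundary arithmetic these relocated helpers implement (and that the
new FT_NeverAlign sizing reuses via isAgainstBoundary), here is a small
self-contained sketch with worked values; a raw power-of-two stands in for
llvm::Align, and the addresses are assumptions chosen to mirror the test below:

    #include <cassert>
    #include <cstdint>

    static bool mayCrossBoundary(uint64_t Start, uint64_t Size, uint64_t B) {
      uint64_t End = Start + Size;
      return (Start / B) != ((End - 1) / B);   // spans two B-byte "lines"
    }

    static bool isAgainstBoundary(uint64_t Start, uint64_t Size, uint64_t B) {
      return ((Start + Size) & (B - 1)) == 0;  // ends exactly on a boundary
    }

    int main() {
      // A 2-byte test at 0x3e ends exactly at 0x40, i.e. against a 64B
      // boundary, so a preceding NeverAlign fragment sizes itself to one nop.
      assert(isAgainstBoundary(0x3e, 2, 64));
      // A 4-byte instruction at 0x3e ends at 0x42 and crosses the boundary.
      assert(mayCrossBoundary(0x3e, 4, 64));
      // needPadding() in the patch is simply the OR of these two predicates.
      return 0;
    }
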
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index b065d03..9add574 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -675,14 +675,8 @@ static void AttemptToFoldSymbolOffsetDifference(
if (FA == FB) {
Reverse = SA.getOffset() < SB.getOffset();
} else if (!isa<MCDummyFragment>(FA)) {
- // Testing FA < FB is slow. Use setLayoutOrder to speed up computation.
- // The formal layout order will be finalized in MCAssembler::layout.
- if (FA->getLayoutOrder() == 0 || FB->getLayoutOrder()== 0) {
- unsigned LayoutOrder = 0;
- for (MCFragment &F : *FA->getParent())
- F.setLayoutOrder(++LayoutOrder);
- }
- Reverse = FA->getLayoutOrder() < FB->getLayoutOrder();
+ Reverse = std::find_if(std::next(FA->getIterator()), SecA.end(),
+ [&](auto &I) { return &I == FB; }) != SecA.end();
}
uint64_t SAOffset = SA.getOffset(), SBOffset = SB.getOffset();
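
In other words, Reverse is now decided by a forward scan: FB is considered to
come after FA iff walking the section's fragment list from FA reaches FB. This
drops the lazily cached layout-order shortcut, which mutated fragment state in
the middle of expression evaluation, in favor of a simple linear walk.
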
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 7d08268..6e09caa 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -274,6 +274,9 @@ void MCFragment::destroy() {
case FT_Align:
delete cast<MCAlignFragment>(this);
return;
+ case FT_NeverAlign:
+ delete cast<MCNeverAlignFragment>(this);
+ return;
case FT_Data:
delete cast<MCDataFragment>(this);
return;
@@ -342,6 +345,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << "<";
switch (getKind()) {
case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+ case MCFragment::FT_NeverAlign:
+ OS << "MCNeverAlignFragment";
+ break;
case MCFragment::FT_Data: OS << "MCDataFragment"; break;
case MCFragment::FT_CompactEncodedInst:
OS << "MCCompactEncodedInstFragment"; break;
@@ -381,6 +387,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
<< " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">";
break;
}
+ case MCFragment::FT_NeverAlign: {
+ const MCNeverAlignFragment *NAF = cast<MCNeverAlignFragment>(this);
+ OS << "\n ";
+ OS << " Alignment:" << NAF->getAlignment() << ">";
+ break;
+ }
case MCFragment::FT_Data: {
const auto *DF = cast<MCDataFragment>(this);
OS << "\n ";
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 0ccade9..117475b 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -658,6 +658,11 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment,
cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true, STI);
}
+void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+ const MCSubtargetInfo &STI) {
+ insert(new MCNeverAlignFragment(ByteAlignment, STI));
+}
+
void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
unsigned char Value,
SMLoc Loc) {
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 199d865..a97cba6 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1235,6 +1235,8 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value,
unsigned MaxBytesToEmit) {}
void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {}
+void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+ const MCSubtargetInfo &STI) {}
void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value,
SMLoc Loc) {}
void MCStreamer::emitBundleAlignMode(Align Alignment) {}
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 6623106..6c6bd2c 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1153,6 +1153,7 @@ private:
bool parseDirectiveArch();
bool parseDirectiveNops(SMLoc L);
bool parseDirectiveEven(SMLoc L);
+ bool parseDirectiveAvoidEndAlign(SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
/// CodeView FPO data directives.
@@ -4601,6 +4602,8 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
} else if (IDVal == ".nops")
return parseDirectiveNops(DirectiveID.getLoc());
+ else if (IDVal == ".avoid_end_align")
+ return parseDirectiveAvoidEndAlign(DirectiveID.getLoc());
else if (IDVal == ".even")
return parseDirectiveEven(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_proc")
@@ -4695,6 +4698,27 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
return false;
}
+/// Directive for NeverAlign fragment testing, not for general usage!
+/// parseDirectiveAvoidEndAlign
+/// ::= .avoid_end_align alignment
+bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) {
+ int64_t Alignment = 0;
+ SMLoc AlignmentLoc = getTok().getLoc();
+ if (getParser().checkForValidSection() ||
+ getParser().parseAbsoluteExpression(Alignment))
+ return true;
+
+ if (getParser().parseEOL("unexpected token in directive"))
+ return true;
+
+ if (Alignment <= 0)
+ return Error(AlignmentLoc, "expected a positive alignment");
+
+ getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI());
+ return false;
+}
+
/// ParseDirectiveCode
/// ::= .code16 | .code32 | .code64
bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s
new file mode 100644
index 0000000..1d74840
--- /dev/null
+++ b/llvm/test/MC/X86/directive-avoid_end_align.s
@@ -0,0 +1,208 @@
+# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s
+# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
+
+# avoid_end_align has no effect since the test below doesn't end at an alignment boundary:
+.avoid_end_align 64
+# CHECK-NOT: nop
+ testl %eax, %eax
+# CHECK: testl %eax, %eax
+ je .LBB0
+
+.fill 58, 1, 0x00
+# NeverAlign followed by MCDataFragment:
+# avoid_end_align inserts a nop because `test` would otherwise end at the alignment boundary:
+.avoid_end_align 64
+# CHECK: 3e: nop
+ testl %eax, %eax
+# CHECK-NEXT: 3f: testl %eax, %eax
+ je .LBB0
+# CHECK-NEXT: 41: je
+.LBB0:
+ retq
+
+.p2align 6
+.L0:
+.nops 57
+ int3
+# NeverAlign followed by RelaxableFragment:
+.avoid_end_align 64
+# CHECK: ba: nop
+ cmpl $(.L1-.L0), %eax
+# CHECK-NEXT: bb: cmpl
+ je .L0
+# CHECK-NEXT: c1: je
+.nops 65
+.L1:
+
+###############################################################################
+# Experiment A:
+# Check that NeverAlign doesn't introduce infinite loops in layout.
+# Control:
+# 1. NeverAlign fragment is not added,
+# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively),
+# 3. cmp and jcc are placed so that they are split by a 64B alignment boundary.
+# 4. jcc would be relaxed to a longer format if at least one byte is added
+# between .L10 and je itself, e.g. by adding a NeverAlign padding byte,
+# or relaxing cmp instruction.
+# 5. cmp would be relaxed to a longer format if at least one byte is added
+# between .L11 and .L12, e.g. due to relaxing jcc instruction.
+.p2align 6
+# CHECK: 140: int3
+.fill 2, 1, 0xcc
+.L10:
+.nops 122
+ int3
+# CHECK: 1bc: int3
+# no avoid_end_align here
+# CHECK-NOT: nop
+ cmp $(.L12-.L11), %eax
+# CHECK: 1bd: cmpl
+.L11:
+ je .L10
+# CHECK-NEXT: 1c0: je
+.nops 125
+.L12:
+
+# Experiment:
+# Same setup as control, except NeverAlign fragment is added before cmp.
+# Expected effect:
+# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B
+# alignment boundary,
+# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4),
+# 3. This results in a cmp relaxation (Control rule #5),
+# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc
+# are no longer split by an alignment boundary (cmp crosses the boundary).
+# 5. NeverAlign padding is removed.
+# 6. cmp and jcc instruction remain in relaxed form.
+# 7. Relaxation converges, layout succeeds.
+.p2align 6
+# CHECK: 240: int3
+.fill 2, 1, 0xcc
+.L20:
+.nops 122
+ int3
+# CHECK: 2bc: int3
+.avoid_end_align 64
+# CHECK-NOT: nop
+ cmp $(.L22-.L21), %eax
+# CHECK-NEXT: 2bd: cmpl
+.L21:
+ je .L20
+# CHECK-NEXT: 2c3: je
+.nops 125
+.L22:
+
+###############################################################################
+# Experiment B: similar to exp A, but we check that once NeverAlign padding is
+# removed from the layout (exp A, experiment step 5), the increased distance
+# between the symbols L33 and L34 triggers the relaxation of the instruction
+# at label L32.
+#
+# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to
+# steps 2-3 of exp A's experiment:
+# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4),
+# 3. This results in a cmp relaxation (Control rule #5),
+# => short cmp under L32
+.p2align 6
+# CHECK: 380: int3
+.fill 2, 1, 0xcc
+.L30:
+.nops 122
+ int3
+# CHECK: 3fc: int3
+ hlt
+#.avoid_end_align 64
+.L33:
+ cmp $(.L32-.L31), %eax
+# CHECK: 3fe: cmpl
+.L31:
+ je .L30
+# CHECK-NEXT: 404: je
+.nops 114
+.p2align 1
+ int3
+ int3
+# CHECK: 47c: int3
+.L34:
+.nops 9
+.L32:
+ cmp $(.L33-.L34), %eax
+# CHECK: 487: cmp
+# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128)
+ int3
+# CHECK-NEXT: 48a: int3
+
+# Control 2: leaving out a byte at L43 (site of NeverAlign), plus the relaxed
+# jcc and cmp, leads to a relaxed cmp under L42 (-129 as cmp's immediate).
+.p2align 6
+# CHECK: 4c0: int3
+.fill 2, 1, 0xcc
+.L40:
+.nops 122
+ int3
+# CHECK: 53c: int3
+# int3
+#.avoid_end_align 64
+.L43:
+ cmp $(.L42-.L41+0x100), %eax
+# CHECK: 53d: cmpl
+.L41:
+ je .L40+0x100
+# CHECK-NEXT: 543: je
+.nops 114
+.p2align 1
+ int3
+ int3
+# CHECK: 5bc: int3
+.L44:
+.nops 9
+.L42:
+ cmp $(.L43-.L44), %eax
+# CHECK: 5c7: cmp
+# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129)
+ int3
+# CHECK-NEXT: 5cd: int3
+
+# Experiment
+# Check that removing the NeverAlign padding at L53, as a result of the
+# alignment and relaxation of the cmp and jcc following it (see exp A), thus
+# reproducing the case of Control 2 (a relaxed cmp under L52), is handled
+# correctly.
+.p2align 6
+# CHECK: 600: int3
+.fill 2, 1, 0xcc
+.L50:
+.nops 122
+ int3
+# CHECK: 67c: int3
+.avoid_end_align 64
+.L53:
+# CHECK-NOT: nop
+ cmp $(.L52-.L51), %eax
+# CHECK-NEXT: 67d: cmpl
+.L51:
+ je .L50
+# CHECK-NEXT: 683: je
+.nops 114
+.p2align 1
+ int3
+ int3
+# CHECK: 6fc: int3
+.L54:
+.nops 9
+.L52:
+ cmp $(.L53-.L54), %eax
+# CHECK: 707: cmp
+# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129)
+ int3
+# CHECK-NEXT: 70d: int3
+
+.ifdef ERR
+# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression
+.avoid_end_align
+# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression
+.avoid_end_align x
+# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment
+.avoid_end_align 0
+# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive
+.avoid_end_align 64, 0
+.endif