Diffstat (limited to 'llvm/lib')
-rwxr-xr-x  llvm/lib/Analysis/ConstantFolding.cpp                     |  24
-rw-r--r--  llvm/lib/Analysis/Loads.cpp                               |   4
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp                     |  46
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp            |   2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp              |   3
-rw-r--r--  llvm/lib/CodeGen/MIR2Vec.cpp                              |  48
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp                     |  25
-rw-r--r--  llvm/lib/IR/DIBuilder.cpp                                 |  19
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h      |   4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp       | 526
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h         |  15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                           |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def             |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp            |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp  | 159
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt                     |   1
-rw-r--r--  llvm/lib/Target/Hexagon/Hexagon.td                        |   3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPatterns.td                |   7
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonSubtarget.h                |   2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp          |   7
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp |  13
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp           |  16
22 files changed, 528 insertions(+), 410 deletions(-)
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b744537..31546e6 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1495,22 +1495,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
default:
llvm_unreachable("Missing case");
case Instruction::PtrToAddr:
- // TODO: Add some of the ptrtoint folds here as well.
- break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
- // If the input is a inttoptr, eliminate the pair. This requires knowing
+ // If the input is an inttoptr, eliminate the pair. This requires knowing
// the width of a pointer, so it can't be done in ConstantExpr::getCast.
if (CE->getOpcode() == Instruction::IntToPtr) {
- // zext/trunc the inttoptr to pointer size.
- FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0),
- DL.getIntPtrType(CE->getType()),
+ // zext/trunc the inttoptr to pointer/address size.
+        Type *MidTy = Opcode == Instruction::PtrToAddr
+ ? DL.getAddressType(CE->getType())
+ : DL.getIntPtrType(CE->getType());
+ FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy,
/*IsSigned=*/false, DL);
} else if (auto *GEP = dyn_cast<GEPOperator>(CE)) {
// If we have GEP, we can perform the following folds:
- // (ptrtoint (gep null, x)) -> x
- // (ptrtoint (gep (gep null, x), y) -> x + y, etc.
+ // (ptrtoint/ptrtoaddr (gep null, x)) -> x
+ // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc.
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt BaseOffset(BitWidth, 0);
auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets(
@@ -1518,7 +1518,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
if (Base->isNullValue()) {
FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset);
} else {
- // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V
+ // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V))
+ // -> sub (ptrtoint/ptrtoaddr Ptr), V
if (GEP->getNumIndices() == 1 &&
GEP->getSourceElementType()->isIntegerTy(8)) {
auto *Ptr = cast<Constant>(GEP->getPointerOperand());
@@ -1528,12 +1529,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
Sub->getOpcode() == Instruction::Sub &&
Sub->getOperand(0)->isNullValue())
FoldedValue = ConstantExpr::getSub(
- ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1));
+ ConstantExpr::getCast(Opcode, Ptr, IntIdxTy),
+ Sub->getOperand(1));
}
}
}
if (FoldedValue) {
- // Do a zext or trunc to get to the ptrtoint dest size.
+ // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size.
return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false,
DL);
}
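For context, a minimal sketch (not part of the patch; the helper name is invented) of the intermediate-type choice the new code makes: ptrtoaddr only reads the address bits, so the inttoptr operand is zext'd/trunc'd to the address width, while ptrtoint reads the full pointer representation and keeps the pointer-sized integer type. Both DataLayout accessors are the ones the patch itself uses.

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper mirroring the MidTy selection above.
static Type *midIntTypeFor(unsigned CastOpc, Type *PtrTy,
                           const DataLayout &DL) {
  return CastOpc == Instruction::PtrToAddr
             ? DL.getAddressType(PtrTy)  // address-width integer
             : DL.getIntPtrType(PtrTy);  // pointer-representation width
}
```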
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 4c2e1fe..54f55b2 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) {
auto *User = Worklist.pop_back_val();
if (!Visited.insert(User).second)
continue;
- if (isa<ICmpInst, PtrToIntInst>(User))
+ // FIXME: The PtrToIntInst case here is not strictly correct, as it
+ // changes which provenance is exposed.
+ if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User))
continue;
if (isa<PHINode, SelectInst>(User))
Worklist.append(User->user_begin(), User->user_end());
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 6f6776c..30bcff7 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
return RewriteMap.lookup_or(S, S);
};
- // Check for the SCEV expression (A /u B) * B while B is a constant, inside
- // \p Expr. The check is done recuresively on \p Expr, which is assumed to
- // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A
- // /u B) * B was found, and return the divisor B in \p DividesBy. For
- // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since
- // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p
- // DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo =
- [&](const SCEV *Expr, const SCEV *&DividesBy) {
- if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) {
- if (Mul->getNumOperands() != 2)
- return false;
- auto *MulLHS = Mul->getOperand(0);
- auto *MulRHS = Mul->getOperand(1);
- if (isa<SCEVConstant>(MulLHS))
- std::swap(MulLHS, MulRHS);
- if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS))
- if (Div->getOperand(1) == MulRHS) {
- DividesBy = MulRHS;
- return true;
- }
- }
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) ||
- HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
- // Return true if Expr known to divide by \p DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy =
- [&](const SCEV *Expr, const SCEV *DividesBy) {
- if (SE.getURemExpr(Expr, DividesBy)->isZero())
- return true;
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) &&
- IsKnownToDivideBy(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
const SCEV *DividesBy = nullptr;
- if (HasDivisibiltyInfo(RewrittenLHS, DividesBy))
- // Check that the whole expression is divided by DividesBy
- DividesBy =
- IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr;
+ const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS);
+ if (!Multiple.isOne())
+ DividesBy = SE.getConstant(Multiple);
// Collect rewrites for LHS and its transitive operands based on the
// condition.
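The deleted lambdas are subsumed by ScalarEvolution::getConstantMultiple, which computes the largest constant the expression is known to be a multiple of and folds through min/max operands. A minimal sketch (helper name invented) reproducing the deleted comment's example, umin(umax((A /u 8) * 8, 16), 64), for which the result is 8:

```cpp
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Returns the known constant divisor of RewrittenLHS, or null if it is
// only known to be a multiple of 1.
static const SCEV *getDividesBy(ScalarEvolution &SE,
                                const SCEV *RewrittenLHS) {
  const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS);
  return Multiple.isOne() ? nullptr : SE.getConstant(Multiple);
}
```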
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index fa0ccd6..906d62a3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1215,7 +1215,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
LLT MemTy = LdSt.getMMO().getMemoryType();
SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
{{MemTy, MemTy.getSizeInBits().getKnownMinValue(),
- AtomicOrdering::NotAtomic}});
+ AtomicOrdering::NotAtomic, AtomicOrdering::NotAtomic}});
unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
SmallVector<LLT> OpTys;
if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index b2f8435..cdc1f64 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -958,7 +958,8 @@ void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) {
for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) {
LLT Ty = LLT::scalar(Size);
SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
- {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}});
+ {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic,
+ AtomicOrdering::NotAtomic}});
SmallVector<LLT> StoreTys({Ty, PtrTy});
LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs);
LegalizeActionStep ActionStep = LI.getAction(Q);
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index 87565c0..e859765 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -49,14 +49,8 @@ cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
//===----------------------------------------------------------------------===//
MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries,
- const TargetInstrInfo *TII)
- : TII(*TII) {
- // Fixme: Use static factory methods for creating vocabularies instead of
- // public constructors
- // Early return for invalid inputs - creates empty/invalid vocabulary
- if (!TII || OpcodeEntries.empty())
- return;
-
+ const TargetInstrInfo &TII)
+ : TII(TII) {
buildCanonicalOpcodeMapping();
unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size();
@@ -67,6 +61,15 @@ MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries,
Layout.TotalEntries = Storage.size();
}
+Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries,
+ const TargetInstrInfo &TII) {
+ if (Entries.empty())
+ return createStringError(errc::invalid_argument,
+ "Empty vocabulary entries provided");
+
+ return MIRVocabulary(std::move(Entries), TII);
+}
+
std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) {
// Extract base instruction name using regex to capture letters and
// underscores Examples: "ADD32rr" -> "ADD", "ARITH_FENCE" -> "ARITH_FENCE"
@@ -107,13 +110,11 @@ unsigned MIRVocabulary::getCanonicalIndexForBaseName(StringRef BaseName) const {
}
unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const {
- assert(isValid() && "MIR2Vec Vocabulary is invalid");
auto BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode));
return getCanonicalIndexForBaseName(BaseOpcode);
}
std::string MIRVocabulary::getStringKey(unsigned Pos) const {
- assert(isValid() && "MIR2Vec Vocabulary is invalid");
assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary");
// For now, all entries are opcodes since we only have one section
@@ -232,16 +233,11 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() {
return Error::success();
}
-void MIR2VecVocabLegacyAnalysis::emitError(Error Err, LLVMContext &Ctx) {
- Ctx.emitError(toString(std::move(Err)));
-}
-
-mir2vec::MIRVocabulary
+Expected<mir2vec::MIRVocabulary>
MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
if (StrVocabMap.empty()) {
if (Error Err = readVocabulary()) {
- emitError(std::move(Err), M.getContext());
- return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr);
+ return std::move(Err);
}
}
@@ -255,15 +251,13 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
if (auto *MF = MMI.getMachineFunction(F)) {
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- return mir2vec::MIRVocabulary(std::move(StrVocabMap), TII);
+ return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII);
}
}
- // No machine functions available - return invalid vocabulary
- emitError(make_error<StringError>("No machine functions found in module",
- inconvertibleErrorCode()),
- M.getContext());
- return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr);
+ // No machine functions available - return error
+ return createStringError(errc::invalid_argument,
+ "No machine functions found in module");
}
//===----------------------------------------------------------------------===//
@@ -284,13 +278,15 @@ bool MIR2VecVocabPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) {
bool MIR2VecVocabPrinterLegacyPass::doFinalization(Module &M) {
auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>();
- auto MIR2VecVocab = Analysis.getMIR2VecVocabulary(M);
+ auto MIR2VecVocabOrErr = Analysis.getMIR2VecVocabulary(M);
- if (!MIR2VecVocab.isValid()) {
- OS << "MIR2Vec Vocabulary Printer: Invalid vocabulary\n";
+ if (!MIR2VecVocabOrErr) {
+ OS << "MIR2Vec Vocabulary Printer: Failed to get vocabulary - "
+ << toString(MIR2VecVocabOrErr.takeError()) << "\n";
return false;
}
+ auto &MIR2VecVocab = *MIR2VecVocabOrErr;
unsigned Pos = 0;
for (const auto &Entry : MIR2VecVocab) {
OS << "Key: " << MIR2VecVocab.getStringKey(Pos++) << ": ";
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3a9651c..89ed4da 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
+STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
@@ -193,6 +194,13 @@ static cl::opt<bool>
MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
cl::desc("Use the MVE code generator for software pipelining"));
+/// A command line argument to limit the number of store instructions in the
+/// target basic block.
+static cl::opt<unsigned> SwpMaxNumStores(
+ "pipeliner-max-num-stores",
+ cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
+ cl::init(200));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
return false;
}
+ unsigned NumStores = 0;
+ for (MachineInstr &MI : *L.getHeader())
+ if (MI.mayStore())
+ ++NumStores;
+ if (NumStores > SwpMaxNumStores) {
+ LLVM_DEBUG(dbgs() << "Too many stores\n");
+ NumFailTooManyStores++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Too many store instructions in the loop: "
+ << ore::NV("NumStores", NumStores) << " > "
+ << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
+ });
+ return false;
+ }
+
// Remove any subregisters from inputs to phi nodes.
preprocessPhiNodes(*L.getHeader());
return true;
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1ae20a9f..07a870f 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType(
DICompositeType *DIBuilder::createVectorType(uint64_t Size,
uint32_t AlignInBits, DIType *Ty,
- DINodeArray Subscripts) {
- auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "",
- nullptr, 0, nullptr, Ty, Size, AlignInBits, 0,
- DINode::FlagVector, Subscripts, 0,
- /*EnumKind=*/std::nullopt, nullptr);
+ DINodeArray Subscripts,
+ Metadata *BitStride) {
+ auto *R = DICompositeType::get(
+ VMContext, dwarf::DW_TAG_array_type, /*Name=*/"",
+ /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty,
+ /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0,
+ /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts,
+ /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr,
+ /*TemplateParams=*/nullptr, /*Identifier=*/"",
+ /*Discriminator=*/nullptr, /*DataLocation=*/nullptr,
+ /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr,
+ /*Annotations=*/nullptr, /*Specification=*/nullptr,
+ /*NumExtraInhabitants=*/0,
+ /*BitStride=*/BitStride);
trackIfUnresolved(R);
return R;
}
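A hypothetical use of the extended signature (all values invented for illustration), creating debug info for a 4 x i32 vector with an explicit 32-bit element stride:

```cpp
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
using namespace llvm;

static DICompositeType *emitVec4Ty(Module &M, LLVMContext &Ctx) {
  DIBuilder DIB(M);
  DIType *I32 = DIB.createBasicType("int", 32, dwarf::DW_ATE_signed);
  DINodeArray Subs = DIB.getOrCreateArray({DIB.getOrCreateSubrange(0, 4)});
  Metadata *Stride = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt64Ty(Ctx), 32)); // stride in bits
  return DIB.createVectorType(/*Size=*/128, /*AlignInBits=*/128, I32, Subs,
                              /*BitStride=*/Stride);
}
```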
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 91e64e6..bd0a17d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -315,6 +315,8 @@ public:
}
void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE stack sizes to be aligned to 16-bytes");
StackSizeZPR = ZPR;
StackSizePPR = PPR;
HasCalculatedStackSizeSVE = true;
@@ -425,6 +427,8 @@ public:
// Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE callee-save sizes to be aligned to 16-bytes");
ZPRCalleeSavedStackSize = ZPR;
PPRCalleeSavedStackSize = PPR;
HasSVECalleeSavedStackSize = true;
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 1568161..f110558 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::PTRUE_C_B:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
- case AArch64::SEH_SavePReg:
case AArch64::SEH_SaveZReg:
return true;
}
@@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::LDR_PXI:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
+ case AArch64::SEH_SavePReg:
+ return true;
}
}
@@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon(
HasFP = AFL.hasFP(MF);
NeedsWinCFI = AFL.needsWinCFI(MF);
+
+ // Windows unwind can't represent the required stack adjustments if we have
+ // both SVE callee-saves and dynamic stack allocations, and the frame pointer
+ // is before the SVE spills. The allocation of the frame pointer must be the
+ // last instruction in the prologue so the unwinder can restore the stack
+ // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29,
+ // -17`.)
+ //
+ // Because of this, we do spills in the opposite order on Windows: first SVE,
+ // then GPRs. The main side-effect of this is that it makes accessing
+ // parameters passed on the stack more expensive.
+ //
+ // We could consider rearranging the spills for simpler cases.
+ if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) {
+ if (AFI->hasStackHazardSlotIndex())
+ reportFatalUsageError("SME hazard padding is not supported on Windows");
+ SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord;
+ } else if (AFI->hasSplitSVEObjects()) {
+ SVELayout = SVEStackLayout::Split;
+ }
}
MachineBasicBlock::iterator
@@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump(
return true;
}
+SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const {
+ StackOffset PPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
+ StackOffset ZPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
+ StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
+ StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split)
+ return {{PPRCalleeSavesSize, PPRLocalsSize},
+ {ZPRCalleeSavesSize, ZPRLocalsSize}};
+ // For simplicity, attribute all locals to ZPRs when split SVE is disabled.
+ return {{PPRCalleeSavesSize, StackOffset{}},
+ {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}};
+}
+
+struct SVEPartitions {
+ struct {
+ MachineBasicBlock::iterator Begin, End;
+ } PPR, ZPR;
+};
+
+static SVEPartitions partitionSVECS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ StackOffset PPRCalleeSavesSize,
+ StackOffset ZPRCalleeSavesSize,
+ bool IsEpilogue) {
+ MachineBasicBlock::iterator PPRsI = MBBI;
+ MachineBasicBlock::iterator End =
+ IsEpilogue ? MBB.begin() : MBB.getFirstTerminator();
+ auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; };
+ // Process the SVE CS to find the starts/ends of the ZPR and PPR areas.
+ if (PPRCalleeSavesSize) {
+ PPRsI = AdjustI(PPRsI);
+ assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction");
+ while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI)))
+ IsEpilogue ? (--PPRsI) : (++PPRsI);
+ }
+ MachineBasicBlock::iterator ZPRsI = PPRsI;
+ if (ZPRCalleeSavesSize) {
+ ZPRsI = AdjustI(ZPRsI);
+ assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction");
+ while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI)))
+ IsEpilogue ? (--ZPRsI) : (++ZPRsI);
+ }
+ if (IsEpilogue)
+ return {{PPRsI, MBBI}, {ZPRsI, PPRsI}};
+ return {{MBBI, PPRsI}, {PPRsI, ZPRsI}};
+}
+
AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF,
MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL)
@@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() {
bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
- // Windows unwind can't represent the required stack adjustments if we have
- // both SVE callee-saves and dynamic stack allocations, and the frame
- // pointer is before the SVE spills. The allocation of the frame pointer
- // must be the last instruction in the prologue so the unwinder can restore
- // the stack pointer correctly. (And there isn't any unwind opcode for
- // `addvl sp, x29, -17`.)
- //
- // Because of this, we do spills in the opposite order on Windows: first SVE,
- // then GPRs. The main side-effect of this is that it makes accessing
- // parameters passed on the stack more expensive.
- //
- // We could consider rearranging the spills for simpler cases.
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
- if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
- reportFatalUsageError("SME hazard padding is not supported on Windows");
-
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
determineLocalsStackSize(NumBytes, PrologueSaveSize);
MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
// If we're doing SVE saves first, we need to immediately allocate space
// for fixed objects, then space for the SVE callee saves.
//
@@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() {
if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
- StackOffset PPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset ZPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize;
- StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
- StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
-
- std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin,
- ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd;
-
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes);
StackOffset CFAOffset =
- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+ StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize;
+
MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
- if (!FPAfterSVECalleeSaves) {
- // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR
- // areas.
- PPRCalleeSavesBegin = AfterGPRSavesI;
- if (PPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = "
- << PPRCalleeSavesSize.getScalable() << "\n");
-
- assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfPPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
+ // Allocate space for the callee saves and PPR locals (if any).
+ if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ auto [PPRRange, ZPRRange] =
+ partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize,
+ ZPR.CalleeSavesSize, /*IsEpilogue=*/false);
+ AfterSVESavesI = ZPRRange.End;
+ if (EmitAsyncCFI)
+ emitCalleeSavedSVELocations(AfterSVESavesI);
+
+ StackOffset AllocateBeforePPRs = SVECalleeSavesSize;
+ StackOffset AllocateAfterPPRs = PPR.LocalsSize;
+ if (SVELayout == SVEStackLayout::Split) {
+ AllocateBeforePPRs = PPR.CalleeSavesSize;
+ AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize;
}
- PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI;
- if (ZPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = "
- << ZPRCalleeSavesSize.getScalable() << "\n");
- assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfZPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
- }
- ZPRCalleeSavesEnd = AfterSVESavesI;
- }
-
- if (EmitAsyncCFI)
- emitCalleeSavedSVELocations(AfterSVESavesI);
-
- if (AFI->hasSplitSVEObjects()) {
- assert(!FPAfterSVECalleeSaves &&
- "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
- assert(!AFL.canUseRedZone(MF) &&
- "Cannot use redzone with aarch64-split-sve-objects");
- // TODO: Handle HasWinCFI/NeedsWinCFI?
- assert(!NeedsWinCFI &&
- "WinCFI with aarch64-split-sve-objects is not supported");
-
- // Split ZPR and PPR allocation.
- // Allocate PPR callee saves
- allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
+ allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
- ZPRLocalsSize || PPRLocalsSize);
- CFAOffset += PPRCalleeSavesSize;
-
- // Allocate PPR locals + ZPR callee saves
- assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
+ MFI.hasVarSizedObjects() || AllocateAfterPPRs ||
+ ZPR.LocalsSize || NonSVELocalsSize);
+ CFAOffset += AllocateBeforePPRs;
+ assert(PPRRange.End == ZPRRange.Begin &&
"Expected ZPR callee saves after PPR locals");
- allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
- PPRLocalsSize + ZPRCalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRLocalsSize);
- CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
-
- // Allocate ZPR locals
- allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
- ZPRLocalsSize + StackOffset::getFixed(NumBytes),
+ allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
+ MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
+ NonSVELocalsSize);
+ CFAOffset += AllocateAfterPPRs;
} else {
- // Allocate space for the callee saves (if any).
- StackOffset LocalsSize =
- PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes);
- if (!FPAfterSVECalleeSaves)
- allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || LocalsSize);
+ assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
+    // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been
+    // allocated. Separate PPR locals are not supported; all SVE locals,
+    // both PPR and ZPR, live within the ZPR locals area.
+ assert(!PPR.LocalsSize && "Unexpected PPR locals!");
CFAOffset += SVECalleeSavesSize;
+ }
- // Allocate space for the rest of the frame including SVE locals. Align the
- // stack as necessary.
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
- "Cannot use redzone with stack realignment");
- if (!AFL.canUseRedZone(MF)) {
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize;
- allocateStackSpace(AfterSVESavesI, RealignmentPadding,
- SVELocalsSize + StackOffset::getFixed(NumBytes),
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
- }
+ // Allocate space for the rest of the frame including ZPR locals. Align the
+ // stack as necessary.
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+ "Cannot use redzone with stack realignment");
+ if (!AFL.canUseRedZone(MF)) {
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the
+ // correct value here, as NumBytes also includes padding bytes, which
+ // shouldn't be counted here.
+ allocateStackSpace(
+ AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize,
+ EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects());
}
// If we need a base pointer, set it up here. It's whatever the value of the
- // stack pointer is at this point. Any variable size objects will be allocated
- // after this, so we can still use the base pointer to reference locals.
+ // stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // locals.
//
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
@@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
StackOffset::getScalable(MFI.getObjectOffset(FI)) -
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
- if (AFI->hasSplitSVEObjects() &&
+ // The scalable vectors are below (lower address) the scalable predicates
+ // with split SVE objects, so we must subtract the size of the predicates.
+ if (SVELayout == SVEStackLayout::Split &&
MFI.getStackID(FI) == TargetStackID::ScalableVector)
Offset -= PPRStackSize;
@@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() {
return;
}
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
// Assume we can't combine the last pop with the sp restore.
bool CombineAfterCSRBump = false;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
AfterCSRPopSize += FixedObject;
} else if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
@@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() {
while (FirstGPRRestoreI != Begin) {
--FirstGPRRestoreI;
if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
- (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
+ (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
++FirstGPRRestoreI;
break;
} else if (CombineSPBump)
@@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() {
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
- StackOffset ZPRStackSize = AFL.getZPRStackSize(MF);
- StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
- StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
-
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
- assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+ assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE");
// When we are about to restore the CSRs, the CFA register is SP again.
if (EmitCFI && HasFP)
@@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- if (!AFI->hasSplitSVEObjects()) {
- // Process the SVE callee-saves to determine what space needs to be
- // deallocated.
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
- MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
- RestoreEnd = FirstGPRRestoreI;
- int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize();
- int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize();
- int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize;
-
- if (SVECalleeSavedSize) {
- if (FPAfterSVECalleeSaves)
- RestoreEnd = MBB.getFirstTerminator();
-
- RestoreBegin = std::prev(RestoreEnd);
- while (RestoreBegin != MBB.begin() &&
- isPartOfSVECalleeSaves(std::prev(RestoreBegin)))
- --RestoreBegin;
-
- assert(isPartOfSVECalleeSaves(RestoreBegin) &&
- isPartOfSVECalleeSaves(std::prev(RestoreEnd)) &&
- "Unexpected instruction");
-
- StackOffset CalleeSavedSizeAsOffset =
- StackOffset::getScalable(SVECalleeSavedSize);
- DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
- DeallocateAfter = CalleeSavedSizeAsOffset;
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset SVEStackSize =
+ SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
+ MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
+ MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
+
+ // Deallocate the SVE area.
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize;
+ // If the callee-save area is before FP, restoring the FP implicitly
+ // deallocates non-callee-save SVE allocations. Otherwise, deallocate them
+ // explicitly.
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
+ SVELocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
}
- // Deallocate the SVE area.
- if (FPAfterSVECalleeSaves) {
- // If the callee-save area is before FP, restoring the FP implicitly
- // deallocates non-callee-save SVE allocations. Otherwise, deallocate
- // them explicitly.
- if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
- emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI);
- }
+ // Deallocate callee-save non-SVE registers.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()),
- TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate callee-save SVE registers.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
- } else if (SVEStackSize) {
- int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
- // If we have stack realignment or variable-sized objects we must use the
- // FP to restore SVE callee saves (as there is an unknown amount of
- // data/padding between the SP and SVE CS area).
- Register BaseForSVEDealloc =
- (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
- : AArch64::SP;
- if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need
- // to compute the base address by subtracting the offest in a
- // temporary register first (to avoid briefly deallocating the SVE
- // CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
- MachineInstr::FrameDestroy);
- }
- // The code below will deallocate the stack space space by moving the
- // SP to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- StackOffset::getScalable(-SVECalleeSavedSize), TII,
+ // Deallocate fixed objects.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(FixedObject), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
+
+ // Deallocate callee-save SVE registers.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
+ } else if (AFI->hasSVEStackSize()) {
+ // If we have stack realignment or variable-sized objects we must use the FP
+ // to restore SVE callee saves (as there is an unknown amount of
+ // data/padding between the SP and SVE CS area).
+ Register BaseForSVEDealloc =
+ (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
+ : AArch64::SP;
+ if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
+      // TODO: Support stack realignment and variable-sized objects.
+ assert(
+ SVELayout != SVEStackLayout::Split &&
+ "unexpected stack realignment or variable sized objects with split "
+ "SVE stack objects");
+
+ Register CalleeSaveBase = AArch64::FP;
+ if (int64_t CalleeSaveBaseOffset =
+ AFI->getCalleeSaveBaseToFrameRecordOffset()) {
+        // If we have a non-zero offset to the non-SVE CS base we need to
+        // compute the base address by subtracting the offset in a temporary
+ // register first (to avoid briefly deallocating the SVE CS).
+ CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
+ StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
MachineInstr::FrameDestroy);
- } else if (BaseForSVEDealloc == AArch64::SP) {
- if (SVECalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(NumBytes), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize + StackOffset::getFixed(
- NumBytes + PrologueSaveSize));
- NumBytes = 0;
- }
-
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
-
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- DeallocateAfter +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ }
+ // The code below will deallocate the stack space space by moving the SP
+ // to the start of the SVE callee-save area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
+ -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
+ } else if (BaseForSVEDealloc == AArch64::SP) {
+ auto CFAOffset =
+ SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
+
+ if (SVECalleeSavesSize) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ auto NonSVELocals = StackOffset::getFixed(NumBytes);
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ NonSVELocals, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= NonSVELocals;
+ NumBytes = 0;
}
- if (EmitCFI)
- emitCalleeSavedSVERestores(RestoreEnd);
- }
- } else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
- // TODO: Support stack realigment and variable-sized objects.
- assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
- "unexpected stack realignment or variable sized objects with split "
- "SVE stack objects");
- // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR
- // areas.
- auto ZPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- auto PPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
- StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
-
- MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
- PPRRestoreEnd = FirstGPRRestoreI;
- if (PPRCalleeSavedSize) {
- PPRRestoreBegin = std::prev(PPRRestoreEnd);
- while (PPRRestoreBegin != MBB.begin() &&
- isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
- --PPRRestoreBegin;
- }
-
- MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
- ZPRRestoreEnd = PPRRestoreBegin;
- if (ZPRCalleeSavedSize) {
- ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
- while (ZPRRestoreBegin != MBB.begin() &&
- isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
- --ZPRRestoreBegin;
- }
-
- auto CFAOffset =
- SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
- if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- auto NonSVELocals = StackOffset::getFixed(NumBytes);
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- NonSVELocals, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- NumBytes = 0;
- CFAOffset -= NonSVELocals;
- }
+ if (ZPR.LocalsSize) {
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= ZPR.LocalsSize;
+ }
- if (ZPRLocalsSize) {
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= ZPRLocalsSize;
- }
+ StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split &&
+ (PPR.LocalsSize || ZPR.CalleeSavesSize)) {
+ assert(PPRRange.Begin == ZPRRange.End &&
+ "Expected PPR restores after ZPR");
+ emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ PPR.LocalsSize + ZPR.CalleeSavesSize, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize;
+ SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize;
+ }
- if (PPRLocalsSize || ZPRCalleeSavedSize) {
- assert(PPRRestoreBegin == ZPRRestoreEnd &&
- "Expected PPR restores after ZPR");
- emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- PPRLocalsSize + ZPRCalleeSavedSize, TII,
- MachineInstr::FrameDestroy, false, false, nullptr,
- EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
- }
- if (PPRCalleeSavedSize) {
- emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
- PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
- false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
+      // With split SVE, this deallocates the PPRs; otherwise, ZPRs + PPRs:
+ if (SVECalleeSavesToDealloc)
+ emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesToDealloc, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
}
- // We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
if (EmitCFI)
- emitCalleeSavedSVERestores(ZPRRestoreEnd);
+ emitCalleeSavedSVERestores(
+ SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End);
}
if (!HasFP) {
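Putting the allocation and deallocation steps together, the Split layout implies roughly the following frame picture (a sketch inferred from the code above, not from the patch text), which is also why emitCalleeSavedSVELocations adjusts ScalableVector offsets by the predicate-area size:

```cpp
// Approximate frame layout with SVEStackLayout::Split, higher addresses first:
//
//   GPR callee-saves / frame record
//   PPR callee-saves
//   PPR locals
//   ZPR callee-saves
//   ZPR locals
//   non-SVE locals
//
// ZPR (vector) objects sit below the PPR (predicate) area, so their offsets
// from the callee-save base must be moved down by PPRStackSize.
```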
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
index a1c9b34..bccadda 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -27,11 +27,23 @@ class AArch64Subtarget;
class AArch64FunctionInfo;
class AArch64FrameLowering;
+struct SVEFrameSizes {
+ struct {
+ StackOffset CalleeSavesSize, LocalsSize;
+ } PPR, ZPR;
+};
+
class AArch64PrologueEpilogueCommon {
public:
AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL);
+ enum class SVEStackLayout {
+ Default,
+ Split,
+ CalleeSavesAboveFrameRecord,
+ };
+
protected:
bool requiresGetVGCall() const;
@@ -53,6 +65,8 @@ protected:
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
+ SVEFrameSizes getSVEStackFrameSizes() const;
+
MachineFunction &MF;
MachineBasicBlock &MBB;
@@ -68,6 +82,7 @@ protected:
bool IsFunclet = false; // Note: Set in derived constructors.
bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup.
bool HomPrologEpilog = false; // Note: Set in derived constructors.
+ SVEStackLayout SVELayout = SVEStackLayout::Default;
// Note: "HasWinCFI" is mutable as it can change in any "emit" function.
mutable bool HasWinCFI = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0f2c335..ce2b4a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,6 +562,11 @@ public:
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+struct AMDGPUUniformIntrinsicCombinePass
+ : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70..a6074ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
+MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c7a91f4c..4958a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
+
+ if (EnableUniformIntrinsicCombine)
+ PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerPeepholeEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 0000000..50c78d8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,159 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
+/// It's true that this pass has transforms that can lead to a situation where
+/// some instruction whose operand was previously recognized as statically
+/// uniform is later on no longer recognized as statically uniform. However, the
+/// semantics of how programs execute don't (and must not, for this precise
+/// reason) care about static uniformity, they only ever care about dynamic
+/// uniformity. And every instruction that's downstream and cares about dynamic
+/// uniformity must be convergent (and isel will introduce v_readfirstlane for
+/// them if their operands can't be proven statically uniform).
+///
+/// This pass is implemented as a ModulePass because intrinsic declarations
+/// exist at the module scope, allowing us to skip processing entirely if no
+/// declarations are present and to traverse their user lists directly when
+/// they are. A FunctionPass would instead require scanning every instruction
+/// in every function to find relevant intrinsics, which is far less efficient.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Wrapper for querying uniformity info that first checks locally tracked
+/// instructions.
+static bool
+isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
+ const ValueMap<const Value *, bool> &Tracker) {
+ Value *V = U.get();
+ if (auto It = Tracker.find(V); It != Tracker.end())
+ return !It->second; // divergent if marked false
+ return UI.isDivergentUse(U);
+}
+
+/// Optimizes uniform intrinsic calls if their operand can be proven uniform.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+ const UniformityInfo &UI,
+ ValueMap<const Value *, bool> &Tracker) {
+ llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ Value *Src = II.getArgOperand(0);
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
+ II.replaceAllUsesWith(Src);
+ II.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ Value *Src = II.getArgOperand(0);
+ if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+ return false;
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
+ bool Changed = false;
+ for (User *U : make_early_inc_range(II.users())) {
+ if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+ Value *Op0 = ICmp->getOperand(0);
+ Value *Op1 = ICmp->getOperand(1);
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ Value *OtherOp = Op0 == &II ? Op1 : Op0;
+
+ if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+ // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
+ Instruction *NotOp =
+ BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+ Tracker[NotOp] = true; // NOT preserves uniformity
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
+ ICmp->replaceAllUsesWith(NotOp);
+ ICmp->eraseFromParent();
+ Changed = true;
+ } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+ // Case: (icmp ne %ballot, 0) -> %ballot_arg
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+ << *Src << '\n');
+ ICmp->replaceAllUsesWith(Src);
+ ICmp->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ // Erase the intrinsic if it has no remaining uses.
+ if (II.use_empty())
+ II.eraseFromParent();
+ return Changed;
+ }
+ default:
+ llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+ }
+ return false;
+}
+
+/// Iterates over intrinsic declarations in the module to optimize their uses.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+ bool IsChanged = false;
+ ValueMap<const Value *, bool> Tracker;
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_ballot:
+ break;
+ default:
+ continue;
+ }
+
+ for (User *U : make_early_inc_range(F.users())) {
+ auto *II = cast<IntrinsicInst>(U);
+ Function *ParentF = II->getFunction();
+ const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
+ }
+ }
+ return IsChanged;
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runUniformIntrinsicCombine(M, AM))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<UniformityInfoAnalysis>();
+ return PA;
+}
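The pass is exposed as `amdgpu-uniform-intrinsic-combine` in the registry hunk above (so it can be exercised with `opt -passes=amdgpu-uniform-intrinsic-combine`) and is added to the default pipeline behind `-amdgpu-enable-uniform-intrinsic-combine`. A minimal standalone sketch of driving it with the new pass manager; in-tree it runs via the AMDGPU pass-builder callbacks, which also supply target-aware uniformity info:

```cpp
#include "AMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
using namespace llvm;

static void runUniformIntrinsicCombine(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM; // supplies UniformityInfoAnalysis per function
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUUniformIntrinsicCombinePass());
  MPM.run(M, MAM);
}
```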
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56ee..13f727b68 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
+ AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 6d0529f..fb0928b8 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
"Allow GP-relative addressing of global variables">;
def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
-def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true",
- "Use unsafe FP math">;
def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
"true", "Reserve register R19">;
def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
@@ -167,7 +165,6 @@ def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">,
def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">;
def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">,
AssemblerPredicate<(all_of FeatureMemNoShuf)>;
-def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">;
def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||"
"MF->getFunction().hasOptSize()"> {
let RecomputePerFunction = 1;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 4b23670..a0acfcf 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -1611,8 +1611,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt),
$Rt, $Rs),
$Rs, $Rt)>;
-let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in {
- def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
+def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{
+ return N->getFlags().hasApproximateFuncs();
+}]>;
+let Predicates = [HasV67], AddedComplexity = 50 in {
+ def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
}
let Predicates = [HasV67] in {
def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>;
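The DfMpy pattern now keys off the per-instruction `afn` fast-math flag rather than a whole-subtarget feature, so strict and fast code can coexist in one function. A sketch (helper name invented) of how an fmul acquires the flag that the fmul_afn PatFrag tests:

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: builds 'fmul afn double %a, %b', which fmul_afn
// matches via SDNodeFlags::hasApproximateFuncs() after lowering.
static Value *buildAfnFMul(IRBuilder<> &B, Value *A, Value *C) {
  FastMathFlags FMF;
  FMF.setApproxFunc(); // the 'afn' flag
  B.setFastMathFlags(FMF);
  return B.CreateFMul(A, C);
}
```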
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index b111471..7430567 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool UseNewValueJumps = false;
bool UseNewValueStores = false;
bool UseSmallData = false;
- bool UseUnsafeMath = false;
bool UseZRegOps = false;
bool UseHVXIEEEFPOps = false;
bool UseHVXQFloatOps = false;
@@ -234,7 +233,6 @@ public:
bool useNewValueJumps() const { return UseNewValueJumps; }
bool useNewValueStores() const { return UseNewValueStores; }
bool useSmallData() const { return UseSmallData; }
- bool useUnsafeMath() const { return UseUnsafeMath; }
bool useZRegOps() const { return UseZRegOps; }
bool useCabac() const { return UseCabac; }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 0afa04a..f5d8b69 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
std::string FS =
FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
- // Append the preexisting target features last, so that +mattr overrides
- // the "unsafe-fp-math" function attribute.
- // Creating a separate target feature is not strictly necessary, it only
- // exists to make "unsafe-fp-math" force creating a new subtarget.
-
- if (F.getFnAttribute("unsafe-fp-math").getValueAsBool())
- FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS;
auto &I = SubtargetMap[CPU + FS];
if (!I) {
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 5b8ea15..b74a070 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IRB.CreateIsNull(Load), &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
Load->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
@@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
EstimatedStackSize >= Options.StackDepthCallbackMin) {
if (InsertBefore)
IRB.SetInsertPoint(InsertBefore);
- IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge();
+ auto Call = IRB.CreateCall(SanCovStackDepthCallback);
+ if (EntryLoc)
+ Call->setDebugLoc(EntryLoc);
+ Call->setCannotMerge();
}
} else {
// Check stack depth. If it's the deepest so far, record it.
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IsStackLower, &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
LowestStack->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
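InstrumentationIRBuilder differs from a plain IRBuilder&lt;&gt; in that it ensures a debug location is attached when the enclosing function has debug info, which is the point of this change; the explicit setDebugLoc(EntryLoc) calls additionally pin the new instructions to the block's entry location. A hedged sketch (header path as in recent LLVM; helper name invented):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/Transforms/Utils/Instrumentation.h" // header path assumed
using namespace llvm;

// Hypothetical helper: a flag store that keeps debug-location invariants.
static void storeCoverageFlag(Instruction *InsertPt, Value *FlagPtr) {
  InstrumentationIRBuilder IRB(InsertPt);
  auto *Store =
      IRB.CreateStore(ConstantInt::getTrue(IRB.getContext()), FlagPtr);
  Store->setNoSanitizeMetadata(); // don't sanitize our own instrumentation
}
```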
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e448230..3f7003d 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DomTreeUpdater.h"
@@ -382,16 +383,9 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap;
typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap;
inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
- OS << "< ";
- for (const BasicBlock *BB : Path) {
- std::string BBName;
- if (BB->hasName())
- raw_string_ostream(BBName) << BB->getName();
- else
- raw_string_ostream(BBName) << BB;
- OS << BBName << " ";
- }
- OS << ">";
+ auto BBNames = llvm::map_range(
+ Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); });
+ OS << "< " << llvm::join(BBNames, ", ") << " >";
return OS;
}
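For reference, the two ADT helpers used in the rewrite, in standalone form (function name invented): map_range lazily applies a callable over a range, and join concatenates the results with a separator.

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/BasicBlock.h"
#include <string>
#include <vector>

// Renders "< bb1, bb2, ... >", using operand syntax for unnamed blocks.
static std::string
renderPath(const std::vector<const llvm::BasicBlock *> &Path) {
  auto Names = llvm::map_range(Path, [](const llvm::BasicBlock *BB) {
    return BB->getNameOrAsOperand();
  });
  return "< " + llvm::join(Names, ", ") + " >";
}
```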
@@ -423,7 +417,7 @@ struct ThreadingPath {
}
void print(raw_ostream &OS) const {
- OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]";
+ OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]";
}
private: