Diffstat (limited to 'llvm/lib')
-rwxr-xr-x  llvm/lib/Analysis/ConstantFolding.cpp                             |  25
-rw-r--r--  llvm/lib/Analysis/IR2Vec.cpp                                      | 180
-rw-r--r--  llvm/lib/Analysis/Loads.cpp                                       |   4
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp                             |  46
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp                             |  25
-rw-r--r--  llvm/lib/CodeGen/RegisterCoalescer.cpp                            |   6
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp                     |  51
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp                |  32
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp                         |  11
-rw-r--r--  llvm/lib/IR/ConstantFPRange.cpp                                   |  20
-rw-r--r--  llvm/lib/IR/DIBuilder.cpp                                         |  19
-rw-r--r--  llvm/lib/IR/DiagnosticInfo.cpp                                    |   7
-rw-r--r--  llvm/lib/Support/Mustache.cpp                                     | 180
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp                   |  12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h              |   4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp               | 526
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h                 |  15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp                 |  28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h                    |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td                       |   8
-rw-r--r--  llvm/lib/Target/Hexagon/Hexagon.td                                |   3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPatterns.td                        |  28
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonSubtarget.h                        |   2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp                  |   7
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp                       |   6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td                            |   5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td                       |   2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td                         |  94
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrPredicates.td                     |   4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVProcessors.td                          |   3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSiFive7.td                        | 142
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td                           |  16
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp |   6
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp              |   7
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp               |   7
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX10.td                              |   4
-rw-r--r--  llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp    |   2
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp            |  34
-rw-r--r--  llvm/lib/Transforms/IPO/PartialInlining.cpp                       |   2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp           |  24
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp         |  13
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp                   | 154
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                         |   6
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                   |   2
44 files changed, 957 insertions, 821 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b744537..45c889c 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -329,6 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
// Look through ptr->int and ptr->ptr casts.
if (CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::PtrToAddr ||
CE->getOpcode() == Instruction::BitCast)
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL,
DSOEquiv);
@@ -1495,22 +1496,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
default:
llvm_unreachable("Missing case");
case Instruction::PtrToAddr:
- // TODO: Add some of the ptrtoint folds here as well.
- break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
- // If the input is a inttoptr, eliminate the pair. This requires knowing
+ // If the input is an inttoptr, eliminate the pair. This requires knowing
// the width of a pointer, so it can't be done in ConstantExpr::getCast.
if (CE->getOpcode() == Instruction::IntToPtr) {
- // zext/trunc the inttoptr to pointer size.
- FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0),
- DL.getIntPtrType(CE->getType()),
+ // zext/trunc the inttoptr to pointer/address size.
+ Type *MidTy = Opcode == Instruction::PtrToInt
+ ? DL.getAddressType(CE->getType())
+ : DL.getIntPtrType(CE->getType());
+ FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy,
/*IsSigned=*/false, DL);
} else if (auto *GEP = dyn_cast<GEPOperator>(CE)) {
// If we have GEP, we can perform the following folds:
- // (ptrtoint (gep null, x)) -> x
- // (ptrtoint (gep (gep null, x), y) -> x + y, etc.
+ // (ptrtoint/ptrtoaddr (gep null, x)) -> x
+ // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc.
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt BaseOffset(BitWidth, 0);
auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets(
@@ -1518,7 +1519,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
if (Base->isNullValue()) {
FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset);
} else {
- // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V
+ // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V))
+ // -> sub (ptrtoint/ptrtoaddr Ptr), V
if (GEP->getNumIndices() == 1 &&
GEP->getSourceElementType()->isIntegerTy(8)) {
auto *Ptr = cast<Constant>(GEP->getPointerOperand());
@@ -1528,12 +1530,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
Sub->getOpcode() == Instruction::Sub &&
Sub->getOperand(0)->isNullValue())
FoldedValue = ConstantExpr::getSub(
- ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1));
+ ConstantExpr::getCast(Opcode, Ptr, IntIdxTy),
+ Sub->getOperand(1));
}
}
}
if (FoldedValue) {
- // Do a zext or trunc to get to the ptrtoint dest size.
+ // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size.
return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false,
DL);
}
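For reference, a minimal standalone sketch (not part of the patch, values and module setup are illustrative) of exercising the extended fold through the public ConstantFoldCastOperand entry point, which with this change eliminates an inttoptr under ptrtoaddr the same way it already did under ptrtoint:

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx); // default DataLayout: 64-bit pointers
  const DataLayout &DL = M.getDataLayout();

  Type *I64 = Type::getInt64Ty(Ctx);
  PointerType *PtrTy = PointerType::get(Ctx, 0);

  // ptrtoaddr (inttoptr i64 42 to ptr) to i64
  Constant *IntToPtr =
      ConstantExpr::getIntToPtr(ConstantInt::get(I64, 42), PtrTy);
  // PtrToAddr now takes the same inttoptr-elimination path as PtrToInt,
  // zext/trunc'ing through the address type rather than the pointer-sized
  // integer type.
  if (Constant *Folded =
          ConstantFoldCastOperand(Instruction::PtrToAddr, IntToPtr, I64, DL))
    Folded->print(outs()); // i64 42
  return 0;
}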
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 1794a60..85b5372 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const {
// Embedder and its subclasses
//===----------------------------------------------------------------------===//
-Embedder::Embedder(const Function &F, const Vocabulary &Vocab)
- : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
- OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight),
- FuncVector(Embedding(Dimension)) {}
-
std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
const Vocabulary &Vocab) {
switch (Mode) {
@@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
return nullptr;
}
-const InstEmbeddingsMap &Embedder::getInstVecMap() const {
- if (InstVecMap.empty())
- computeEmbeddings();
- return InstVecMap;
-}
-
-const BBEmbeddingsMap &Embedder::getBBVecMap() const {
- if (BBVecMap.empty())
- computeEmbeddings();
- return BBVecMap;
-}
-
-const Embedding &Embedder::getBBVector(const BasicBlock &BB) const {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end())
- return It->second;
- computeEmbeddings(BB);
- return BBVecMap[&BB];
-}
+Embedding Embedder::computeEmbeddings() const {
+ Embedding FuncVector(Dimension, 0.0);
-const Embedding &Embedder::getFunctionVector() const {
- // Currently, we always (re)compute the embeddings for the function.
- // This is cheaper than caching the vector.
- computeEmbeddings();
- return FuncVector;
-}
-
-void Embedder::computeEmbeddings() const {
if (F.isDeclaration())
- return;
-
- FuncVector = Embedding(Dimension, 0.0);
+ return FuncVector;
// Consider only the basic blocks that are reachable from entry
- for (const BasicBlock *BB : depth_first(&F)) {
- computeEmbeddings(*BB);
- FuncVector += BBVecMap[BB];
- }
+ for (const BasicBlock *BB : depth_first(&F))
+ FuncVector += computeEmbeddings(*BB);
+ return FuncVector;
}
-void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
+Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const {
Embedding BBVector(Dimension, 0);
// We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands())
- ArgEmb += Vocab[*Op];
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
- }
- BBVecMap[&BB] = BBVector;
-}
-
-void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
- Embedding BBVector(Dimension, 0);
+ for (const auto &I : BB.instructionsWithoutDebug())
+ BBVector += computeEmbeddings(I);
+ return BBVector;
+}
+
+Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const {
+ // Currently, we always (re)compute the embeddings for symbolic embedder.
+ // This is cheaper than caching the vectors.
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands())
+ ArgEmb += Vocab[*Op];
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ return InstVector;
+}
+
+Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const {
+ // If we have already computed the embedding for this instruction, return it
+ auto It = InstVecMap.find(&I);
+ if (It != InstVecMap.end())
+ return It->second;
- // We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- // TODO: Handle call instructions differently.
- // For now, we treat them like other instructions
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands()) {
- // If the operand is defined elsewhere, we use its embedding
- if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
- auto DefIt = InstVecMap.find(DefInst);
- // Fixme (#159171): Ideally we should never miss an instruction
- // embedding here.
- // But when we have cyclic dependencies (e.g., phi
- // nodes), we might miss the embedding. In such cases, we fall back to
- // using the vocabulary embedding. This can be fixed by iterating to a
- // fixed-point, or by using a simple solver for the set of simultaneous
- // equations.
- // Another case when we might miss an instruction embedding is when
- // the operand instruction is in a different basic block that has not
- // been processed yet. This can be fixed by processing the basic blocks
- // in a topological order.
- if (DefIt != InstVecMap.end())
- ArgEmb += DefIt->second;
- else
- ArgEmb += Vocab[*Op];
- }
- // If the operand is not defined by an instruction, we use the vocabulary
- else {
- LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
- << *Op << "=" << Vocab[*Op][0] << "\n");
+ // TODO: Handle call instructions differently.
+ // For now, we treat them like other instructions
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands()) {
+ // If the operand is defined elsewhere, we use its embedding
+ if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
+ auto DefIt = InstVecMap.find(DefInst);
+ // Fixme (#159171): Ideally we should never miss an instruction
+ // embedding here.
+ // But when we have cyclic dependencies (e.g., phi
+ // nodes), we might miss the embedding. In such cases, we fall back to
+ // using the vocabulary embedding. This can be fixed by iterating to a
+ // fixed-point, or by using a simple solver for the set of simultaneous
+ // equations.
+ // Another case when we might miss an instruction embedding is when
+ // the operand instruction is in a different basic block that has not
+ // been processed yet. This can be fixed by processing the basic blocks
+ // in a topological order.
+ if (DefIt != InstVecMap.end())
+ ArgEmb += DefIt->second;
+ else
ArgEmb += Vocab[*Op];
- }
}
- // Create the instruction vector by combining opcode, type, and arguments
- // embeddings
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- // Add compare predicate embedding as an additional operand if applicable
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
+ // If the operand is not defined by an instruction, we use the
+ // vocabulary
+ else {
+ LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
+ << *Op << "=" << Vocab[*Op][0] << "\n");
+ ArgEmb += Vocab[*Op];
+ }
}
- BBVecMap[&BB] = BBVector;
+ // Create the instruction vector by combining opcode, type, and arguments
+ // embeddings
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ InstVecMap[&I] = InstVector;
+ return InstVector;
}
// ==----------------------------------------------------------------------===//
@@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
Emb->getFunctionVector().print(OS);
OS << "Basic block vectors:\n";
- const auto &BBMap = Emb->getBBVecMap();
for (const BasicBlock &BB : F) {
- auto It = BBMap.find(&BB);
- if (It != BBMap.end()) {
- OS << "Basic block: " << BB.getName() << ":\n";
- It->second.print(OS);
- }
+ OS << "Basic block: " << BB.getName() << ":\n";
+ Emb->getBBVector(BB).print(OS);
}
OS << "Instruction vectors:\n";
- const auto &InstMap = Emb->getInstVecMap();
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
- auto It = InstMap.find(&I);
- if (It != InstMap.end()) {
- OS << "Instruction: ";
- I.print(OS);
- It->second.print(OS);
- }
+ OS << "Instruction: ";
+ I.print(OS);
+ Emb->getInstVector(I).print(OS);
}
}
}
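A short usage sketch of the embedder API after this refactoring, with the accessor names taken from the printer pass above (the Vocabulary is assumed to come from wherever the caller already has one, e.g. the IR2Vec vocabulary analysis):

#include "llvm/Analysis/IR2Vec.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::ir2vec;

// Embeddings are now computed on demand and returned by value; only the
// flow-aware embedder keeps an instruction cache internally.
void printEmbeddings(const Function &F, const Vocabulary &Vocab,
                     raw_ostream &OS) {
  auto Emb = Embedder::create(IR2VecKind::Symbolic, F, Vocab);
  if (!Emb)
    return;
  Emb->getFunctionVector().print(OS);
  for (const BasicBlock &BB : F) {
    Emb->getBBVector(BB).print(OS);
    for (const Instruction &I : BB)
      Emb->getInstVector(I).print(OS);
  }
}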
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 4c2e1fe..54f55b2 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) {
auto *User = Worklist.pop_back_val();
if (!Visited.insert(User).second)
continue;
- if (isa<ICmpInst, PtrToIntInst>(User))
+ // FIXME: The PtrToIntInst case here is not strictly correct, as it
+ // changes which provenance is exposed.
+ if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User))
continue;
if (isa<PHINode, SelectInst>(User))
Worklist.append(User->user_begin(), User->user_end());
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 6f6776c..30bcff7 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
return RewriteMap.lookup_or(S, S);
};
- // Check for the SCEV expression (A /u B) * B while B is a constant, inside
- // \p Expr. The check is done recuresively on \p Expr, which is assumed to
- // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A
- // /u B) * B was found, and return the divisor B in \p DividesBy. For
- // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since
- // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p
- // DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo =
- [&](const SCEV *Expr, const SCEV *&DividesBy) {
- if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) {
- if (Mul->getNumOperands() != 2)
- return false;
- auto *MulLHS = Mul->getOperand(0);
- auto *MulRHS = Mul->getOperand(1);
- if (isa<SCEVConstant>(MulLHS))
- std::swap(MulLHS, MulRHS);
- if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS))
- if (Div->getOperand(1) == MulRHS) {
- DividesBy = MulRHS;
- return true;
- }
- }
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) ||
- HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
- // Return true if Expr known to divide by \p DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy =
- [&](const SCEV *Expr, const SCEV *DividesBy) {
- if (SE.getURemExpr(Expr, DividesBy)->isZero())
- return true;
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) &&
- IsKnownToDivideBy(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
const SCEV *DividesBy = nullptr;
- if (HasDivisibiltyInfo(RewrittenLHS, DividesBy))
- // Check that the whole expression is divided by DividesBy
- DividesBy =
- IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr;
+ const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS);
+ if (!Multiple.isOne())
+ DividesBy = SE.getConstant(Multiple);
// Collect rewrites for LHS and its transitive operands based on the
// condition.
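The removed pattern match is subsumed by ScalarEvolution::getConstantMultiple. An illustrative helper (not from the patch; the name is made up) showing the equivalent query:

#include "llvm/Analysis/ScalarEvolution.h"

using namespace llvm;

// Returns a SCEV constant that Expr is known to be a multiple of, or nullptr
// if nothing better than 1 is known. For Expr = umin(umax((A /u 8) * 8, 16),
// 64) this yields 8, matching what HasDivisibiltyInfo/IsKnownToDivideBy used
// to compute by hand.
static const SCEV *getKnownDivisor(ScalarEvolution &SE, const SCEV *Expr) {
  const APInt &Multiple = SE.getConstantMultiple(Expr);
  return Multiple.isOne() ? nullptr : SE.getConstant(Multiple);
}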
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3a9651c..89ed4da 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
+STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
@@ -193,6 +194,13 @@ static cl::opt<bool>
MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
cl::desc("Use the MVE code generator for software pipelining"));
+/// A command line argument to limit the number of store instructions in the
+/// target basic block.
+static cl::opt<unsigned> SwpMaxNumStores(
+ "pipeliner-max-num-stores",
+    cl::desc("Maximum number of stores allowed in the target loop."), cl::Hidden,
+ cl::init(200));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
return false;
}
+ unsigned NumStores = 0;
+ for (MachineInstr &MI : *L.getHeader())
+ if (MI.mayStore())
+ ++NumStores;
+ if (NumStores > SwpMaxNumStores) {
+ LLVM_DEBUG(dbgs() << "Too many stores\n");
+ NumFailTooManyStores++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Too many store instructions in the loop: "
+ << ore::NV("NumStores", NumStores) << " > "
+ << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
+ });
+ return false;
+ }
+
// Remove any subregisters from inputs to phi nodes.
preprocessPhiNodes(*L.getHeader());
return true;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index ebfea8e..e17a214 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -2051,6 +2051,12 @@ bool RegisterCoalescer::joinCopy(
}
if (CP.getNewRC()) {
+ if (RegClassInfo.getNumAllocatableRegs(CP.getNewRC()) == 0) {
+ LLVM_DEBUG(dbgs() << "\tNo " << TRI->getRegClassName(CP.getNewRC())
+                      << " are available for allocation\n");
+ return false;
+ }
+
auto SrcRC = MRI->getRegClass(CP.getSrcReg());
auto DstRC = MRI->getRegClass(CP.getDstReg());
unsigned SrcIdx = CP.getSrcIdx();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c5c3866..5ffdc4e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19340,8 +19340,10 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) {
EVT VT = N->getValueType(0);
const SDNodeFlags Flags = N->getFlags();
unsigned Opc = N->getOpcode();
- bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
- bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM;
+ bool PropAllNaNsToQNaNs = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
+ bool PropOnlySNaNsToQNaNs = Opc == ISD::FMINNUM || Opc == ISD::FMAXNUM;
+ bool IsMin =
+ Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM || Opc == ISD::FMINIMUMNUM;
SelectionDAG::FlagInserter FlagsInserter(DAG, N);
// Constant fold.
@@ -19356,34 +19358,53 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) {
if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) {
const APFloat &AF = N1CFP->getValueAPF();
- // minnum(X, nan) -> X
- // maxnum(X, nan) -> X
- // minimum(X, nan) -> nan
- // maximum(X, nan) -> nan
- if (AF.isNaN())
- return PropagatesNaN ? N->getOperand(1) : N->getOperand(0);
+ // minnum(X, qnan) -> X
+ // maxnum(X, qnan) -> X
+ // minnum(X, snan) -> qnan
+ // maxnum(X, snan) -> qnan
+ // minimum(X, nan) -> qnan
+ // maximum(X, nan) -> qnan
+ // minimumnum(X, nan) -> X
+ // maximumnum(X, nan) -> X
+ if (AF.isNaN()) {
+ if (PropAllNaNsToQNaNs || (AF.isSignaling() && PropOnlySNaNsToQNaNs)) {
+ if (AF.isSignaling())
+ return DAG.getConstantFP(AF.makeQuiet(), SDLoc(N), VT);
+ return N->getOperand(1);
+ }
+ return N->getOperand(0);
+ }
// In the following folds, inf can be replaced with the largest finite
// float, if the ninf flag is set.
if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) {
- // minnum(X, -inf) -> -inf
- // maxnum(X, +inf) -> +inf
+ // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation)
+ // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation)
// minimum(X, -inf) -> -inf if nnan
// maximum(X, +inf) -> +inf if nnan
- if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs()))
+ // minimumnum(X, -inf) -> -inf
+ // maximumnum(X, +inf) -> +inf
+ if (IsMin == AF.isNegative() &&
+ (!PropAllNaNsToQNaNs || Flags.hasNoNaNs()))
return N->getOperand(1);
// minnum(X, +inf) -> X if nnan
// maxnum(X, -inf) -> X if nnan
- // minimum(X, +inf) -> X
- // maximum(X, -inf) -> X
- if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs()))
+ // minimum(X, +inf) -> X (ignoring quieting of sNaNs)
+ // maximum(X, -inf) -> X (ignoring quieting of sNaNs)
+ // minimumnum(X, +inf) -> X if nnan
+ // maximumnum(X, -inf) -> X if nnan
+ if (IsMin != AF.isNegative() && (PropAllNaNsToQNaNs || Flags.hasNoNaNs()))
return N->getOperand(0);
}
}
+ // There are no VECREDUCE variants of FMINIMUMNUM or FMAXIMUMNUM
+ if (Opc == ISD::FMINIMUMNUM || Opc == ISD::FMAXIMUMNUM)
+ return SDValue();
+
if (SDValue SD = reassociateReduction(
- PropagatesNaN
+ PropAllNaNsToQNaNs
? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM)
: (IsMin ? ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX),
Opc, SDLoc(N), VT, N0, N1, Flags))
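The sNaN quieting the new folds rely on can be seen directly with APFloat; a minimal sketch (illustrative only, not part of the patch):

#include "llvm/ADT/APFloat.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // minnum(X, sNaN) and maxnum(X, sNaN) now fold to the quieted NaN rather
  // than to X, while minimum/maximum fold any NaN operand to a quiet NaN and
  // minimumnum/maximumnum always fold to the non-NaN operand.
  APFloat SNaN = APFloat::getSNaN(APFloat::IEEEsingle());
  APFloat Quieted = SNaN.makeQuiet();
  outs() << (Quieted.isSignaling() ? "still signaling\n" : "quieted\n");
  return 0;
}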
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 175753f..6c11c5b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -234,6 +234,19 @@ static bool dontUseFastISelFor(const Function &Fn) {
});
}
+static bool maintainPGOProfile(const TargetMachine &TM,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel != CodeGenOptLevel::None)
+ return true;
+ if (TM.getPGOOption()) {
+ const PGOOptions &Options = *TM.getPGOOption();
+ return Options.Action == PGOOptions::PGOAction::IRUse ||
+ Options.Action == PGOOptions::PGOAction::SampleUse ||
+ Options.CSAction == PGOOptions::CSPGOAction::CSIRUse;
+ }
+ return false;
+}
+
namespace llvm {
//===--------------------------------------------------------------------===//
@@ -395,6 +408,7 @@ SelectionDAGISel::~SelectionDAGISel() { delete CurDAG; }
void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
CodeGenOptLevel OptLevel = Selector->OptLevel;
+ bool RegisterPGOPasses = maintainPGOProfile(Selector->TM, Selector->OptLevel);
if (OptLevel != CodeGenOptLevel::None)
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
@@ -403,15 +417,15 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ if (UseMBPI && RegisterPGOPasses)
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
// AssignmentTrackingAnalysis only runs if assignment tracking is enabled for
// the module.
AU.addRequired<AssignmentTrackingAnalysis>();
AU.addPreserved<AssignmentTrackingAnalysis>();
- if (OptLevel != CodeGenOptLevel::None)
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ if (RegisterPGOPasses)
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -464,6 +478,7 @@ void SelectionDAGISel::initializeAnalysisResults(
(void)MatchFilterFuncName;
#endif
+ bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel);
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
@@ -474,7 +489,7 @@ void SelectionDAGISel::initializeAnalysisResults(
auto *PSI = MAMP.getCachedResult<ProfileSummaryAnalysis>(*Fn.getParent());
BlockFrequencyInfo *BFI = nullptr;
FAM.getResult<BlockFrequencyAnalysis>(Fn);
- if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None)
+ if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses)
BFI = &FAM.getResult<BlockFrequencyAnalysis>(Fn);
FunctionVarLocs const *FnVarLocs = nullptr;
@@ -492,7 +507,7 @@ void SelectionDAGISel::initializeAnalysisResults(
// into account). That's unfortunate but OK because it just means we won't
// ask for passes that have been required anyway.
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
+ if (UseMBPI && RegisterPGOPasses)
FuncInfo->BPI = &FAM.getResult<BranchProbabilityAnalysis>(Fn);
else
FuncInfo->BPI = nullptr;
@@ -518,6 +533,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
(void)MatchFilterFuncName;
#endif
+ bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel);
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
@@ -528,7 +544,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
AC = &MFP.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn);
auto *PSI = &MFP.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
BlockFrequencyInfo *BFI = nullptr;
- if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None)
+ if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses)
BFI = &MFP.getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
FunctionVarLocs const *FnVarLocs = nullptr;
@@ -549,7 +565,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
// into account). That's unfortunate but OK because it just means we won't
// ask for passes that have been required anyway.
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
+ if (UseMBPI && RegisterPGOPasses)
FuncInfo->BPI =
&MFP.getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5980ee3..286ed03 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3623,7 +3623,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
// 1. Build a list of reduction variables.
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
auto Size = ReductionInfos.size();
- Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
+ Type *FuncPtrTy =
+ Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
Type *RedArrayTy = ArrayType::get(PtrTy, Size);
CodeGenIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
@@ -3667,9 +3669,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
Builder.getInt64(MaxDataSize * ReductionInfos.size());
if (!IsTeamsReduction) {
Value *SarFuncCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy);
Value *WcFuncCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+ Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
WcFuncCast};
Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
@@ -10072,13 +10074,14 @@ void OpenMPIRBuilder::initializeTypes(Module &M) {
LLVMContext &Ctx = M.getContext();
StructType *T;
unsigned DefaultTargetAS = Config.getDefaultTargetAS();
+ unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
- VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
+ VarName##Ptr = PointerType::get(Ctx, ProgramAS);
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
T = StructType::getTypeByName(Ctx, StructName); \
if (!T) \
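The distinction drawn in this change is between the default target address space (used for data pointers) and the module's program address space (used for function pointers). A small illustrative helper, not part of the builder itself:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Function pointers live in the DataLayout's program address space, which on
// some targets differs from the address space used for data allocations.
static PointerType *getFunctionPtrTy(Module &M) {
  return PointerType::get(M.getContext(),
                          M.getDataLayout().getProgramAddressSpace());
}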
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index 7509188..fba6942 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -391,3 +391,23 @@ ConstantFPRange ConstantFPRange::unionWith(const ConstantFPRange &CR) const {
return ConstantFPRange(minnum(Lower, CR.Lower), maxnum(Upper, CR.Upper),
MayBeQNaN | CR.MayBeQNaN, MayBeSNaN | CR.MayBeSNaN);
}
+
+ConstantFPRange ConstantFPRange::abs() const {
+ if (isNaNOnly())
+ return *this;
+ // Check if the range is all non-negative or all non-positive.
+ if (Lower.isNegative() == Upper.isNegative()) {
+ if (Lower.isNegative())
+ return negate();
+ return *this;
+ }
+ // The range contains both positive and negative values.
+ APFloat NewLower = APFloat::getZero(getSemantics());
+ APFloat NewUpper = maxnum(-Lower, Upper);
+ return ConstantFPRange(std::move(NewLower), std::move(NewUpper), MayBeQNaN,
+ MayBeSNaN);
+}
+
+ConstantFPRange ConstantFPRange::negate() const {
+ return ConstantFPRange(-Upper, -Lower, MayBeQNaN, MayBeSNaN);
+}
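A small worked example of the new helpers, assuming the existing getNonNaN factory in ConstantFPRange (illustrative, not from the patch):

#include "llvm/ADT/APFloat.h"
#include "llvm/IR/ConstantFPRange.h"

using namespace llvm;

// For a range straddling zero, abs() folds to [0, maxnum(-Lower, Upper)],
// e.g. [-4.0, 2.0] -> [0.0, 4.0]; negate() simply flips sign and swaps the
// bounds. The NaN flags are carried through unchanged in both cases.
static ConstantFPRange absExample() {
  ConstantFPRange R =
      ConstantFPRange::getNonNaN(APFloat(-4.0), APFloat(2.0));
  return R.abs(); // [0.0, 4.0]
}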
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1ae20a9f..07a870f 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType(
DICompositeType *DIBuilder::createVectorType(uint64_t Size,
uint32_t AlignInBits, DIType *Ty,
- DINodeArray Subscripts) {
- auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "",
- nullptr, 0, nullptr, Ty, Size, AlignInBits, 0,
- DINode::FlagVector, Subscripts, 0,
- /*EnumKind=*/std::nullopt, nullptr);
+ DINodeArray Subscripts,
+ Metadata *BitStride) {
+ auto *R = DICompositeType::get(
+ VMContext, dwarf::DW_TAG_array_type, /*Name=*/"",
+ /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty,
+ /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0,
+ /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts,
+ /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr,
+ /*TemplateParams=*/nullptr, /*Identifier=*/"",
+ /*Discriminator=*/nullptr, /*DataLocation=*/nullptr,
+ /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr,
+ /*Annotations=*/nullptr, /*Specification=*/nullptr,
+ /*NumExtraInhabitants=*/0,
+ /*BitStride=*/BitStride);
trackIfUnresolved(R);
return R;
}
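Callers now pass the stride explicitly; a hypothetical call site matching the new signature (sizes and helper name here are illustrative):

#include "llvm/IR/DIBuilder.h"

using namespace llvm;

// Creates a 128-bit vector debug type; BitStride may be null for the common
// case where elements are packed at their natural size.
static DICompositeType *makeVectorDI(DIBuilder &DIB, DIType *EltTy,
                                     DINodeArray Subscripts,
                                     Metadata *BitStride = nullptr) {
  return DIB.createVectorType(/*Size=*/128, /*AlignInBits=*/128, EltTy,
                              Subscripts, BitStride);
}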
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 4f37624..8e6d654 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -273,6 +273,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
C.print(OS);
}
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
+ BranchProbability P)
+ : Key(std::string(Key)) {
+ raw_string_ostream OS(Val);
+ P.print(OS);
+}
+
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc)
: Key(std::string(Key)), Loc(Loc) {
if (Loc) {
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 47860c0..708e79d 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -20,7 +20,7 @@ using namespace llvm::mustache;
namespace {
-using Accessor = SmallVector<std::string>;
+using Accessor = ArrayRef<StringRef>;
static bool isFalsey(const json::Value &V) {
return V.getAsNull() || (V.getAsBoolean() && !V.getAsBoolean().value()) ||
@@ -34,23 +34,32 @@ static bool isContextFalsey(const json::Value *V) {
return isFalsey(*V);
}
-static Accessor splitMustacheString(StringRef Str) {
+static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// We split the mustache string into an accessor.
// For example:
// "a.b.c" would be split into {"a", "b", "c"}
// We make an exception for a single dot which
// refers to the current context.
- Accessor Tokens;
+ SmallVector<StringRef> Tokens;
if (Str == ".") {
- Tokens.emplace_back(Str);
- return Tokens;
- }
- while (!Str.empty()) {
- StringRef Part;
- std::tie(Part, Str) = Str.split(".");
- Tokens.emplace_back(Part.trim());
+ // "." is a special accessor that refers to the current context.
+ // It's a literal, so it doesn't need to be saved.
+ Tokens.push_back(".");
+ } else {
+ while (!Str.empty()) {
+ StringRef Part;
+ std::tie(Part, Str) = Str.split('.');
+ // Each part of the accessor needs to be saved to the arena
+ // to ensure it has a stable address.
+ Tokens.push_back(Ctx.Saver.save(Part.trim()));
+ }
}
- return Tokens;
+ // Now, allocate memory for the array of StringRefs in the arena.
+ StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size());
+ // Copy the StringRefs from the stack vector to the arena.
+ std::copy(Tokens.begin(), Tokens.end(), ArenaTokens);
+ // Return an ArrayRef pointing to the stable arena memory.
+ return ArrayRef<StringRef>(ArenaTokens, Tokens.size());
}
} // namespace
@@ -97,23 +106,23 @@ public:
SetDelimiter,
};
- Token(std::string Str)
- : TokenType(Type::Text), RawBody(std::move(Str)), TokenBody(RawBody),
+ Token(StringRef Str)
+ : TokenType(Type::Text), RawBody(Str), TokenBody(RawBody),
AccessorValue({}), Indentation(0) {};
- Token(std::string RawBody, std::string TokenBody, char Identifier)
- : RawBody(std::move(RawBody)), TokenBody(std::move(TokenBody)),
- Indentation(0) {
+ Token(StringRef RawBody, StringRef TokenBody, char Identifier,
+ MustacheContext &Ctx)
+ : RawBody(RawBody), TokenBody(TokenBody), Indentation(0) {
TokenType = getTokenType(Identifier);
if (TokenType == Type::Comment)
return;
StringRef AccessorStr(this->TokenBody);
if (TokenType != Type::Variable)
AccessorStr = AccessorStr.substr(1);
- AccessorValue = splitMustacheString(StringRef(AccessorStr).trim());
+ AccessorValue = splitMustacheString(StringRef(AccessorStr).trim(), Ctx);
}
- Accessor getAccessor() const { return AccessorValue; }
+ ArrayRef<StringRef> getAccessor() const { return AccessorValue; }
Type getType() const { return TokenType; }
@@ -144,16 +153,16 @@ public:
Type TokenType;
// RawBody is the original string that was tokenized.
- std::string RawBody;
+ StringRef RawBody;
// TokenBody is the original string with the identifier removed.
- std::string TokenBody;
- Accessor AccessorValue;
+ StringRef TokenBody;
+ ArrayRef<StringRef> AccessorValue;
size_t Indentation;
};
using EscapeMap = DenseMap<char, std::string>;
-class ASTNode {
+class ASTNode : public ilist_node<ASTNode> {
public:
enum Type {
Root,
@@ -168,18 +177,19 @@ public:
ASTNode(MustacheContext &Ctx)
: Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {}
- ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent)
- : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent),
+ ASTNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent)
+ : Ctx(Ctx), Ty(Type::Text), Body(Body), Parent(Parent),
ParentContext(nullptr) {}
// Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes
- ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent)
- : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)),
+ ASTNode(MustacheContext &Ctx, Type Ty, ArrayRef<StringRef> Accessor,
+ ASTNode *Parent)
+ : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(Accessor),
ParentContext(nullptr) {}
- void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); };
+ void addChild(AstPtr Child) { Children.push_back(Child); };
- void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); };
+ void setRawBody(StringRef NewBody) { RawBody = NewBody; };
void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
@@ -212,28 +222,27 @@ private:
MustacheContext &Ctx;
Type Ty;
size_t Indentation = 0;
- std::string RawBody;
- std::string Body;
+ StringRef RawBody;
+ StringRef Body;
ASTNode *Parent;
- // TODO: switch implementation to SmallVector<T>
- std::vector<AstPtr> Children;
- const Accessor AccessorValue;
+ ASTNodeList Children;
+ const ArrayRef<StringRef> AccessorValue;
const llvm::json::Value *ParentContext;
};
// A wrapper for arena allocator for ASTNodes
static AstPtr createRootNode(MustacheContext &Ctx) {
- return std::make_unique<ASTNode>(Ctx);
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx);
}
-static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A,
- ASTNode *Parent) {
- return std::make_unique<ASTNode>(Ctx, T, std::move(A), Parent);
+static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T,
+ ArrayRef<StringRef> A, ASTNode *Parent) {
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, T, A, Parent);
}
-static AstPtr createTextNode(MustacheContext &Ctx, std::string Body,
+static AstPtr createTextNode(MustacheContext &Ctx, StringRef Body,
ASTNode *Parent) {
- return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent);
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, Body, Parent);
}
// Function to check if there is meaningful text behind.
@@ -295,9 +304,9 @@ static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
StringRef NextTokenBody = NextToken.TokenBody;
// Cut off the leading newline which could be \n or \r\n.
if (NextTokenBody.starts_with("\r\n"))
- NextToken.TokenBody = NextTokenBody.substr(2).str();
+ NextToken.TokenBody = NextTokenBody.substr(2);
else if (NextTokenBody.starts_with("\n"))
- NextToken.TokenBody = NextTokenBody.substr(1).str();
+ NextToken.TokenBody = NextTokenBody.substr(1);
}
// Adjust previous token body if there no text behind.
@@ -312,7 +321,7 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
StringRef PrevTokenBody = PrevToken.TokenBody;
StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v");
size_t Indentation = PrevTokenBody.size() - Unindented.size();
- PrevToken.TokenBody = Unindented.str();
+ PrevToken.TokenBody = Unindented;
CurrentToken.setIndentation(Indentation);
}
@@ -402,21 +411,20 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
}
static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
+processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
<< ", Kind: " << tagKindToString(T.TagKind) << "\n");
if (T.TagKind == Tag::Kind::Triple) {
- Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&');
+ Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
return std::nullopt;
}
StringRef Interpolated = T.Content;
- std::string RawBody = T.FullMatch.str();
if (!Interpolated.trim().starts_with("=")) {
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
- Tokens.emplace_back(RawBody, Interpolated.str(), Front);
+ Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
return std::nullopt;
}
- Tokens.emplace_back(RawBody, Interpolated.str(), '=');
+ Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
StringRef DelimSpec = Interpolated.trim();
DelimSpec = DelimSpec.drop_front(1);
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
@@ -432,7 +440,7 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
// The mustache spec allows {{{ }}} to unescape variables,
// but we don't support that here. An unescape variable
// is represented only by {{& variable}}.
-static SmallVector<Token> tokenize(StringRef Template) {
+static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
SmallVector<Token> Tokens;
SmallString<8> Open("{{");
@@ -446,19 +454,17 @@ static SmallVector<Token> tokenize(StringRef Template) {
if (T.TagKind == Tag::Kind::None) {
// No more tags, the rest is text.
- Tokens.emplace_back(Template.substr(Start).str());
- LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \""
- << Template.substr(Start) << "\"\n");
+ Tokens.emplace_back(Template.substr(Start));
break;
}
// Add the text before the tag.
if (T.StartPosition > Start) {
StringRef Text = Template.substr(Start, T.StartPosition - Start);
- Tokens.emplace_back(Text.str());
+ Tokens.emplace_back(Text);
}
- if (auto NewDelims = processTag(T, Tokens)) {
+ if (auto NewDelims = processTag(T, Tokens, Ctx)) {
std::tie(Open, Close) = *NewDelims;
}
@@ -614,20 +620,20 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty,
const Accessor &A) {
AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent);
size_t Start = CurrentPtr;
- parseMustache(CurrentNode.get());
+ parseMustache(CurrentNode);
const size_t End = CurrentPtr - 1;
- std::string RawBody;
+ SmallString<128> RawBody;
for (std::size_t I = Start; I < End; I++)
RawBody += Tokens[I].RawBody;
- CurrentNode->setRawBody(std::move(RawBody));
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody)));
+ Parent->addChild(CurrentNode);
}
AstPtr Parser::parse() {
- Tokens = tokenize(TemplateStr);
+ Tokens = tokenize(TemplateStr, Ctx);
CurrentPtr = 0;
AstPtr RootNode = createRootNode(Ctx);
- parseMustache(RootNode.get());
+ parseMustache(RootNode);
return RootNode;
}
@@ -636,31 +642,29 @@ void Parser::parseMustache(ASTNode *Parent) {
while (CurrentPtr < Tokens.size()) {
Token CurrentToken = Tokens[CurrentPtr];
CurrentPtr++;
- Accessor A = CurrentToken.getAccessor();
+ ArrayRef<StringRef> A = CurrentToken.getAccessor();
AstPtr CurrentNode;
switch (CurrentToken.getType()) {
case Token::Type::Text: {
- CurrentNode =
- createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createTextNode(Ctx, CurrentToken.TokenBody, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::Variable: {
- CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createNode(Ctx, ASTNode::Variable, A, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::UnescapeVariable: {
- CurrentNode =
- createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createNode(Ctx, ASTNode::UnescapeVariable, A, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::Partial: {
- CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent);
+ CurrentNode = createNode(Ctx, ASTNode::Partial, A, Parent);
CurrentNode->setIndentation(CurrentToken.getIndentation());
- Parent->addChild(std::move(CurrentNode));
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::SectionOpen: {
@@ -694,8 +698,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) {
return;
}
case json::Value::String: {
- auto Str = *Data.getAsString();
- OS << Str.str();
+ OS << *Data.getAsString();
return;
}
@@ -727,7 +730,7 @@ void ASTNode::renderPartial(const json::Value &CurrentCtx,
<< ", Indentation:" << Indentation << "\n");
auto Partial = Ctx.Partials.find(AccessorValue[0]);
if (Partial != Ctx.Partials.end())
- renderPartial(CurrentCtx, OS, Partial->getValue().get());
+ renderPartial(CurrentCtx, OS, Partial->getValue());
}
void ASTNode::renderVariable(const json::Value &CurrentCtx,
@@ -858,8 +861,8 @@ const json::Value *ASTNode::findContext() {
void ASTNode::renderChild(const json::Value &Contexts,
MustacheOutputStream &OS) {
- for (AstPtr &Child : Children)
- Child->render(Contexts, OS);
+ for (ASTNode &Child : Children)
+ Child.render(Contexts, OS);
}
void ASTNode::renderPartial(const json::Value &Contexts,
@@ -869,7 +872,7 @@ void ASTNode::renderPartial(const json::Value &Contexts,
Partial->render(Contexts, IS);
}
-void ASTNode::renderLambdas(const json::Value &Contexts,
+void ASTNode::renderLambdas(const llvm::json::Value &Contexts,
MustacheOutputStream &OS, Lambda &L) {
json::Value LambdaResult = L();
std::string LambdaStr;
@@ -886,9 +889,9 @@ void ASTNode::renderLambdas(const json::Value &Contexts,
LambdaNode->render(Contexts, OS);
}
-void ASTNode::renderSectionLambdas(const json::Value &Contexts,
+void ASTNode::renderSectionLambdas(const llvm::json::Value &Contexts,
MustacheOutputStream &OS, SectionLambda &L) {
- json::Value Return = L(RawBody);
+ json::Value Return = L(RawBody.str());
if (isFalsey(Return))
return;
std::string LambdaStr;
@@ -899,15 +902,16 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts,
LambdaNode->render(Contexts, OS);
}
-void Template::render(const json::Value &Data, llvm::raw_ostream &OS) {
+void Template::render(const llvm::json::Value &Data, llvm::raw_ostream &OS) {
RawMustacheOutputStream MOS(OS);
Tree->render(Data, MOS);
}
void Template::registerPartial(std::string Name, std::string Partial) {
- Parser P(Partial, Ctx);
+ StringRef SavedPartial = Ctx.Saver.save(Partial);
+ Parser P(SavedPartial, Ctx);
AstPtr PartialTree = P.parse();
- Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree)));
+ Ctx.Partials.insert(std::make_pair(Name, PartialTree));
}
void Template::registerLambda(std::string Name, Lambda L) {
@@ -922,7 +926,7 @@ void Template::overrideEscapeCharacters(EscapeMap E) {
Ctx.Escapes = std::move(E);
}
-Template::Template(StringRef TemplateStr) {
+Template::Template(StringRef TemplateStr, MustacheContext &Ctx) : Ctx(Ctx) {
Parser P(TemplateStr, Ctx);
Tree = P.parse();
// The default behavior is to escape html entities.
@@ -935,18 +939,12 @@ Template::Template(StringRef TemplateStr) {
}
Template::Template(Template &&Other) noexcept
- : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {}
+ : Ctx(Other.Ctx), Tree(Other.Tree) {
+ Other.Tree = nullptr;
+}
Template::~Template() = default;
-Template &Template::operator=(Template &&Other) noexcept {
- if (this != &Other) {
- Ctx = std::move(Other.Ctx);
- Tree = std::move(Other.Tree);
- Other.Tree = nullptr;
- }
- return *this;
-}
} // namespace llvm::mustache
#undef DEBUG_TYPE
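The core of this change is moving token and accessor storage into the context's arena so that Token and ASTNode can hold StringRef/ArrayRef views instead of owning strings. A standalone sketch of that pattern using StringSaver and BumpPtrAllocator (the Saver/Allocator members of MustacheContext are assumed from the code above):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/StringSaver.h"

using namespace llvm;

// Split "a.b.c" into parts, copy each part into the saver and the array of
// StringRefs into the bump allocator, so the returned ArrayRef outlives both
// the input buffer and the temporary SmallVector.
static ArrayRef<StringRef> saveAccessor(StringRef Str, StringSaver &Saver,
                                        BumpPtrAllocator &Alloc) {
  SmallVector<StringRef> Parts;
  while (!Str.empty()) {
    auto [Part, Rest] = Str.split('.');
    Parts.push_back(Saver.save(Part.trim()));
    Str = Rest;
  }
  StringRef *Stable = Alloc.Allocate<StringRef>(Parts.size());
  llvm::copy(Parts, Stable);
  return ArrayRef<StringRef>(Stable, Parts.size());
}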
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c8..31b3d18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1458,6 +1458,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
+ setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
if (Subtarget->hasMatMulInt8()) {
@@ -30769,6 +30770,17 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
ResultVT.isFixedLengthVector() &&
useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
+ // We can handle this case natively by accumulating into a wider
+ // zero-padded vector.
+ if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
+ SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
+ SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
+ SDValue Wide =
+ DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS);
+ SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide);
+ return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
+ }
+
if (ConvertToScalable) {
ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 91e64e6..bd0a17d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -315,6 +315,8 @@ public:
}
void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE stack sizes to be aligned to 16-bytes");
StackSizeZPR = ZPR;
StackSizePPR = PPR;
HasCalculatedStackSizeSVE = true;
@@ -425,6 +427,8 @@ public:
// Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE callee-save sizes to be aligned to 16-bytes");
ZPRCalleeSavedStackSize = ZPR;
PPRCalleeSavedStackSize = PPR;
HasSVECalleeSavedStackSize = true;
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 1568161..f110558 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::PTRUE_C_B:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
- case AArch64::SEH_SavePReg:
case AArch64::SEH_SaveZReg:
return true;
}
@@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::LDR_PXI:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
+ case AArch64::SEH_SavePReg:
+ return true;
}
}
@@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon(
HasFP = AFL.hasFP(MF);
NeedsWinCFI = AFL.needsWinCFI(MF);
+
+ // Windows unwind can't represent the required stack adjustments if we have
+ // both SVE callee-saves and dynamic stack allocations, and the frame pointer
+ // is before the SVE spills. The allocation of the frame pointer must be the
+ // last instruction in the prologue so the unwinder can restore the stack
+ // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29,
+ // -17`.)
+ //
+ // Because of this, we do spills in the opposite order on Windows: first SVE,
+ // then GPRs. The main side-effect of this is that it makes accessing
+ // parameters passed on the stack more expensive.
+ //
+ // We could consider rearranging the spills for simpler cases.
+ if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) {
+ if (AFI->hasStackHazardSlotIndex())
+ reportFatalUsageError("SME hazard padding is not supported on Windows");
+ SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord;
+ } else if (AFI->hasSplitSVEObjects()) {
+ SVELayout = SVEStackLayout::Split;
+ }
}
MachineBasicBlock::iterator
@@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump(
return true;
}
+SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const {
+ StackOffset PPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
+ StackOffset ZPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
+ StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
+ StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split)
+ return {{PPRCalleeSavesSize, PPRLocalsSize},
+ {ZPRCalleeSavesSize, ZPRLocalsSize}};
+ // For simplicity, attribute all locals to ZPRs when split SVE is disabled.
+ return {{PPRCalleeSavesSize, StackOffset{}},
+ {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}};
+}
+
+struct SVEPartitions {
+ struct {
+ MachineBasicBlock::iterator Begin, End;
+ } PPR, ZPR;
+};
+
+static SVEPartitions partitionSVECS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ StackOffset PPRCalleeSavesSize,
+ StackOffset ZPRCalleeSavesSize,
+ bool IsEpilogue) {
+ MachineBasicBlock::iterator PPRsI = MBBI;
+ MachineBasicBlock::iterator End =
+ IsEpilogue ? MBB.begin() : MBB.getFirstTerminator();
+ auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; };
+ // Process the SVE CS to find the starts/ends of the ZPR and PPR areas.
+ if (PPRCalleeSavesSize) {
+ PPRsI = AdjustI(PPRsI);
+ assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction");
+ while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI)))
+ IsEpilogue ? (--PPRsI) : (++PPRsI);
+ }
+ MachineBasicBlock::iterator ZPRsI = PPRsI;
+ if (ZPRCalleeSavesSize) {
+ ZPRsI = AdjustI(ZPRsI);
+ assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction");
+ while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI)))
+ IsEpilogue ? (--ZPRsI) : (++ZPRsI);
+ }
+ if (IsEpilogue)
+ return {{PPRsI, MBBI}, {ZPRsI, PPRsI}};
+ return {{MBBI, PPRsI}, {PPRsI, ZPRsI}};
+}
+
AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF,
MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL)
@@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() {
bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
- // Windows unwind can't represent the required stack adjustments if we have
- // both SVE callee-saves and dynamic stack allocations, and the frame
- // pointer is before the SVE spills. The allocation of the frame pointer
- // must be the last instruction in the prologue so the unwinder can restore
- // the stack pointer correctly. (And there isn't any unwind opcode for
- // `addvl sp, x29, -17`.)
- //
- // Because of this, we do spills in the opposite order on Windows: first SVE,
- // then GPRs. The main side-effect of this is that it makes accessing
- // parameters passed on the stack more expensive.
- //
- // We could consider rearranging the spills for simpler cases.
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
- if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
- reportFatalUsageError("SME hazard padding is not supported on Windows");
-
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
determineLocalsStackSize(NumBytes, PrologueSaveSize);
MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
// If we're doing SVE saves first, we need to immediately allocate space
// for fixed objects, then space for the SVE callee saves.
//
@@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() {
if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
- StackOffset PPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset ZPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize;
- StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
- StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
-
- std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin,
- ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd;
-
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes);
StackOffset CFAOffset =
- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+ StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize;
+
MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
- if (!FPAfterSVECalleeSaves) {
- // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR
- // areas.
- PPRCalleeSavesBegin = AfterGPRSavesI;
- if (PPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = "
- << PPRCalleeSavesSize.getScalable() << "\n");
-
- assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfPPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
+ // Allocate space for the callee saves and PPR locals (if any).
+ if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ auto [PPRRange, ZPRRange] =
+ partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize,
+ ZPR.CalleeSavesSize, /*IsEpilogue=*/false);
+ AfterSVESavesI = ZPRRange.End;
+ if (EmitAsyncCFI)
+ emitCalleeSavedSVELocations(AfterSVESavesI);
+
+ StackOffset AllocateBeforePPRs = SVECalleeSavesSize;
+ StackOffset AllocateAfterPPRs = PPR.LocalsSize;
+ if (SVELayout == SVEStackLayout::Split) {
+ AllocateBeforePPRs = PPR.CalleeSavesSize;
+ AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize;
}
- PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI;
- if (ZPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = "
- << ZPRCalleeSavesSize.getScalable() << "\n");
- assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfZPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
- }
- ZPRCalleeSavesEnd = AfterSVESavesI;
- }
-
- if (EmitAsyncCFI)
- emitCalleeSavedSVELocations(AfterSVESavesI);
-
- if (AFI->hasSplitSVEObjects()) {
- assert(!FPAfterSVECalleeSaves &&
- "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
- assert(!AFL.canUseRedZone(MF) &&
- "Cannot use redzone with aarch64-split-sve-objects");
- // TODO: Handle HasWinCFI/NeedsWinCFI?
- assert(!NeedsWinCFI &&
- "WinCFI with aarch64-split-sve-objects is not supported");
-
- // Split ZPR and PPR allocation.
- // Allocate PPR callee saves
- allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
+ allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
- ZPRLocalsSize || PPRLocalsSize);
- CFAOffset += PPRCalleeSavesSize;
-
- // Allocate PPR locals + ZPR callee saves
- assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
+ MFI.hasVarSizedObjects() || AllocateAfterPPRs ||
+ ZPR.LocalsSize || NonSVELocalsSize);
+ CFAOffset += AllocateBeforePPRs;
+ assert(PPRRange.End == ZPRRange.Begin &&
"Expected ZPR callee saves after PPR locals");
- allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
- PPRLocalsSize + ZPRCalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRLocalsSize);
- CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
-
- // Allocate ZPR locals
- allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
- ZPRLocalsSize + StackOffset::getFixed(NumBytes),
+ allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
+ MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
+ NonSVELocalsSize);
+ CFAOffset += AllocateAfterPPRs;
} else {
- // Allocate space for the callee saves (if any).
- StackOffset LocalsSize =
- PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes);
- if (!FPAfterSVECalleeSaves)
- allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || LocalsSize);
+ assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
+    // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been
+    // allocated (and separate PPR locals are not supported; all SVE locals,
+    // both PPR and ZPR, are within the ZPR locals area).
+ assert(!PPR.LocalsSize && "Unexpected PPR locals!");
CFAOffset += SVECalleeSavesSize;
+ }
- // Allocate space for the rest of the frame including SVE locals. Align the
- // stack as necessary.
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
- "Cannot use redzone with stack realignment");
- if (!AFL.canUseRedZone(MF)) {
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize;
- allocateStackSpace(AfterSVESavesI, RealignmentPadding,
- SVELocalsSize + StackOffset::getFixed(NumBytes),
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
- }
+ // Allocate space for the rest of the frame including ZPR locals. Align the
+ // stack as necessary.
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+ "Cannot use redzone with stack realignment");
+ if (!AFL.canUseRedZone(MF)) {
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the
+ // correct value here, as NumBytes also includes padding bytes, which
+ // shouldn't be counted here.
+ allocateStackSpace(
+ AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize,
+ EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects());
}
// If we need a base pointer, set it up here. It's whatever the value of the
- // stack pointer is at this point. Any variable size objects will be allocated
- // after this, so we can still use the base pointer to reference locals.
+ // stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // locals.
//
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
@@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
StackOffset::getScalable(MFI.getObjectOffset(FI)) -
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
- if (AFI->hasSplitSVEObjects() &&
+  // With split SVE objects, the scalable vectors are below (at lower
+  // addresses than) the scalable predicates, so we must subtract the size of
+  // the predicates.
+ if (SVELayout == SVEStackLayout::Split &&
MFI.getStackID(FI) == TargetStackID::ScalableVector)
Offset -= PPRStackSize;
@@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() {
return;
}
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
// Assume we can't combine the last pop with the sp restore.
bool CombineAfterCSRBump = false;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
AfterCSRPopSize += FixedObject;
} else if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
@@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() {
while (FirstGPRRestoreI != Begin) {
--FirstGPRRestoreI;
if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
- (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
+ (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
++FirstGPRRestoreI;
break;
} else if (CombineSPBump)
@@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() {
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
- StackOffset ZPRStackSize = AFL.getZPRStackSize(MF);
- StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
- StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
-
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
- assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+ assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE");
// When we are about to restore the CSRs, the CFA register is SP again.
if (EmitCFI && HasFP)
@@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- if (!AFI->hasSplitSVEObjects()) {
- // Process the SVE callee-saves to determine what space needs to be
- // deallocated.
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
- MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
- RestoreEnd = FirstGPRRestoreI;
- int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize();
- int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize();
- int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize;
-
- if (SVECalleeSavedSize) {
- if (FPAfterSVECalleeSaves)
- RestoreEnd = MBB.getFirstTerminator();
-
- RestoreBegin = std::prev(RestoreEnd);
- while (RestoreBegin != MBB.begin() &&
- isPartOfSVECalleeSaves(std::prev(RestoreBegin)))
- --RestoreBegin;
-
- assert(isPartOfSVECalleeSaves(RestoreBegin) &&
- isPartOfSVECalleeSaves(std::prev(RestoreEnd)) &&
- "Unexpected instruction");
-
- StackOffset CalleeSavedSizeAsOffset =
- StackOffset::getScalable(SVECalleeSavedSize);
- DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
- DeallocateAfter = CalleeSavedSizeAsOffset;
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset SVEStackSize =
+ SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
+ MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
+ MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
+
+ // Deallocate the SVE area.
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize;
+ // If the callee-save area is before FP, restoring the FP implicitly
+ // deallocates non-callee-save SVE allocations. Otherwise, deallocate them
+ // explicitly.
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
+ SVELocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
}
- // Deallocate the SVE area.
- if (FPAfterSVECalleeSaves) {
- // If the callee-save area is before FP, restoring the FP implicitly
- // deallocates non-callee-save SVE allocations. Otherwise, deallocate
- // them explicitly.
- if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
- emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI);
- }
+ // Deallocate callee-save non-SVE registers.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()),
- TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate callee-save SVE registers.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
- } else if (SVEStackSize) {
- int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
- // If we have stack realignment or variable-sized objects we must use the
- // FP to restore SVE callee saves (as there is an unknown amount of
- // data/padding between the SP and SVE CS area).
- Register BaseForSVEDealloc =
- (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
- : AArch64::SP;
- if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need
- // to compute the base address by subtracting the offest in a
- // temporary register first (to avoid briefly deallocating the SVE
- // CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
- MachineInstr::FrameDestroy);
- }
- // The code below will deallocate the stack space space by moving the
- // SP to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- StackOffset::getScalable(-SVECalleeSavedSize), TII,
+ // Deallocate fixed objects.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(FixedObject), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
+
+ // Deallocate callee-save SVE registers.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
+ } else if (AFI->hasSVEStackSize()) {
+ // If we have stack realignment or variable-sized objects we must use the FP
+ // to restore SVE callee saves (as there is an unknown amount of
+ // data/padding between the SP and SVE CS area).
+ Register BaseForSVEDealloc =
+ (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
+ : AArch64::SP;
+ if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
+      // TODO: Support stack realignment and variable-sized objects.
+ assert(
+ SVELayout != SVEStackLayout::Split &&
+ "unexpected stack realignment or variable sized objects with split "
+ "SVE stack objects");
+
+ Register CalleeSaveBase = AArch64::FP;
+ if (int64_t CalleeSaveBaseOffset =
+ AFI->getCalleeSaveBaseToFrameRecordOffset()) {
+        // If we have a non-zero offset to the non-SVE CS base we need to
+        // compute the base address by subtracting the offset in a temporary
+ // register first (to avoid briefly deallocating the SVE CS).
+ CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
+ StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
MachineInstr::FrameDestroy);
- } else if (BaseForSVEDealloc == AArch64::SP) {
- if (SVECalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(NumBytes), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize + StackOffset::getFixed(
- NumBytes + PrologueSaveSize));
- NumBytes = 0;
- }
-
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
-
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- DeallocateAfter +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ }
+      // The code below will deallocate the stack space by moving the SP
+ // to the start of the SVE callee-save area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
+ -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
+ } else if (BaseForSVEDealloc == AArch64::SP) {
+ auto CFAOffset =
+ SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
+
+ if (SVECalleeSavesSize) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ auto NonSVELocals = StackOffset::getFixed(NumBytes);
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ NonSVELocals, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= NonSVELocals;
+ NumBytes = 0;
}
- if (EmitCFI)
- emitCalleeSavedSVERestores(RestoreEnd);
- }
- } else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
- // TODO: Support stack realigment and variable-sized objects.
- assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
- "unexpected stack realignment or variable sized objects with split "
- "SVE stack objects");
- // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR
- // areas.
- auto ZPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- auto PPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
- StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
-
- MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
- PPRRestoreEnd = FirstGPRRestoreI;
- if (PPRCalleeSavedSize) {
- PPRRestoreBegin = std::prev(PPRRestoreEnd);
- while (PPRRestoreBegin != MBB.begin() &&
- isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
- --PPRRestoreBegin;
- }
-
- MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
- ZPRRestoreEnd = PPRRestoreBegin;
- if (ZPRCalleeSavedSize) {
- ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
- while (ZPRRestoreBegin != MBB.begin() &&
- isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
- --ZPRRestoreBegin;
- }
-
- auto CFAOffset =
- SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
- if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- auto NonSVELocals = StackOffset::getFixed(NumBytes);
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- NonSVELocals, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- NumBytes = 0;
- CFAOffset -= NonSVELocals;
- }
+ if (ZPR.LocalsSize) {
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= ZPR.LocalsSize;
+ }
- if (ZPRLocalsSize) {
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= ZPRLocalsSize;
- }
+ StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split &&
+ (PPR.LocalsSize || ZPR.CalleeSavesSize)) {
+ assert(PPRRange.Begin == ZPRRange.End &&
+ "Expected PPR restores after ZPR");
+ emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ PPR.LocalsSize + ZPR.CalleeSavesSize, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize;
+ SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize;
+ }
- if (PPRLocalsSize || ZPRCalleeSavedSize) {
- assert(PPRRestoreBegin == ZPRRestoreEnd &&
- "Expected PPR restores after ZPR");
- emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- PPRLocalsSize + ZPRCalleeSavedSize, TII,
- MachineInstr::FrameDestroy, false, false, nullptr,
- EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
- }
- if (PPRCalleeSavedSize) {
- emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
- PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
- false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
+      // With split SVE, this deallocates the PPRs; otherwise, the ZPRs + PPRs:
+ if (SVECalleeSavesToDealloc)
+ emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesToDealloc, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
}
- // We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
if (EmitCFI)
- emitCalleeSavedSVERestores(ZPRRestoreEnd);
+ emitCalleeSavedSVERestores(
+ SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End);
}
if (!HasFP) {
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
index a1c9b34..bccadda 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -27,11 +27,23 @@ class AArch64Subtarget;
class AArch64FunctionInfo;
class AArch64FrameLowering;
+struct SVEFrameSizes {
+ struct {
+ StackOffset CalleeSavesSize, LocalsSize;
+ } PPR, ZPR;
+};
+
class AArch64PrologueEpilogueCommon {
public:
AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL);
+ enum class SVEStackLayout {
+ Default,
+ Split,
+ CalleeSavesAboveFrameRecord,
+ };
+
protected:
bool requiresGetVGCall() const;
@@ -53,6 +65,8 @@ protected:
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
+ SVEFrameSizes getSVEStackFrameSizes() const;
+
MachineFunction &MF;
MachineBasicBlock &MBB;
@@ -68,6 +82,7 @@ protected:
bool IsFunclet = false; // Note: Set in derived constructors.
bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup.
bool HomPrologEpilog = false; // Note: Set in derived constructors.
+ SVEStackLayout SVELayout = SVEStackLayout::Default;
// Note: "HasWinCFI" is mutable as it can change in any "emit" function.
mutable bool HasWinCFI = false;
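The body of getSVEStackFrameSizes() is not shown in this excerpt; based on the PPR/ZPR callee-save and locals computations it replaces in the prologue above, a minimal sketch of what it plausibly returns (assuming the AFI/AFL members already used throughout these emitters) is:

    // Sketch only -- mirrors the removed PPRLocalsSize/ZPRLocalsSize logic.
    SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const {
      StackOffset PPRCalleeSavesSize =
          StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
      StackOffset ZPRCalleeSavesSize =
          StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
      return {{PPRCalleeSavesSize, AFL.getPPRStackSize(MF) - PPRCalleeSavesSize},
              {ZPRCalleeSavesSize, AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize}};
    }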
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 557d87f..56807a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// vdst, srcA, srcB, srcC
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
+ Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
OpdsMapping[0] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
@@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
+ Register DstReg = MI.getOperand(0).getReg();
+ unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ unsigned MinNumRegsRequired = DstSize / 32;
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
// vdst, srcA, srcB, srcC, idx
- OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI)
+ : getVGPROpMapping(DstReg, MRI, *TRI);
+
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
- OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[4] =
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index b7dbb59..2c1a13c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1202,6 +1202,12 @@ public:
unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
+ /// Return true if an MFMA that requires at least \p NumRegs should select to
+ /// the AGPR form, instead of the VGPR form.
+ bool selectAGPRFormMFMA(unsigned NumRegs) const {
+ return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs;
+ }
+
// \returns true if a function has a use of AGPRs via inline asm or
// has a call which may use it.
bool mayUseAGPRs(const Function &F) const;
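As a usage note for selectAGPRFormMFMA(): in the register-bank mapping above, an MFMA whose destination type is, say, 128 bits needs 128 / 32 = 4 registers, so the AGPR form is taken only when the VGPR-form override is off and at least four AGPRs are available. A standalone restatement of that rule, for illustration only (MFMAVGPRForm and MinNumAGPRs stand in for the SIMachineFunctionInfo state):

    // Illustration only; not part of the patch.
    static bool useAGPRFormMFMA(bool MFMAVGPRForm, unsigned MinNumAGPRs,
                                unsigned DstSizeInBits) {
      unsigned MinNumRegsRequired = DstSizeInBits / 32; // 32-bit VGPR/AGPR units
      return !MFMAVGPRForm && MinNumAGPRs >= MinNumRegsRequired;
    }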
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7cfd059..6500fce 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa
class CanUseAGPR_MAI<ValueType vt> {
code PredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
- }] # !srl(vt.Size, 5) # ");";
+ MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
+ }] # !srl(vt.Size, 5) # ");";
code GISelPredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+ MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
}] # !srl(vt.Size, 5) # ");";
}
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 6d0529f..fb0928b8 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
"Allow GP-relative addressing of global variables">;
def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
-def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true",
- "Use unsafe FP math">;
def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
"true", "Reserve register R19">;
def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
@@ -167,7 +165,6 @@ def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">,
def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">;
def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">,
AssemblerPredicate<(all_of FeatureMemNoShuf)>;
-def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">;
def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||"
"MF->getFunction().hasOptSize()"> {
let RecomputePerFunction = 1;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 4b23670..85ce944 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>;
-def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>;
def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>;
def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>;
@@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>;
def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>;
def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>;
def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>;
@@ -1611,8 +1590,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt),
$Rt, $Rs),
$Rs, $Rt)>;
-let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in {
- def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
+def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{
+ return N->getFlags().hasApproximateFuncs();
+}]>;
+let Predicates = [HasV67], AddedComplexity = 50 in {
+ def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
}
let Predicates = [HasV67] in {
def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>;
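The new fmul_afn PatFrag keys the DfMpy expansion off the node-level 'afn' (approximate functions) fast-math flag instead of the removed subtarget-wide unsafe-fp feature. A small C++ sketch of the check the PredicateCode performs (assuming a SelectionDAG node, as in the fragment above):

    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Sketch: true for an fmul node carrying the 'afn' fast-math flag.
    static bool isApproxFMul(const llvm::SDNode *N) {
      return N->getOpcode() == llvm::ISD::FMUL &&
             N->getFlags().hasApproximateFuncs();
    }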
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index b111471..7430567 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool UseNewValueJumps = false;
bool UseNewValueStores = false;
bool UseSmallData = false;
- bool UseUnsafeMath = false;
bool UseZRegOps = false;
bool UseHVXIEEEFPOps = false;
bool UseHVXQFloatOps = false;
@@ -234,7 +233,6 @@ public:
bool useNewValueJumps() const { return UseNewValueJumps; }
bool useNewValueStores() const { return UseNewValueStores; }
bool useSmallData() const { return UseSmallData; }
- bool useUnsafeMath() const { return UseUnsafeMath; }
bool useZRegOps() const { return UseZRegOps; }
bool useCabac() const { return UseCabac; }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 0afa04a..f5d8b69 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
std::string FS =
FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
- // Append the preexisting target features last, so that +mattr overrides
- // the "unsafe-fp-math" function attribute.
- // Creating a separate target feature is not strictly necessary, it only
- // exists to make "unsafe-fp-math" force creating a new subtarget.
-
- if (F.getFnAttribute("unsafe-fp-math").getValueAsBool())
- FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS;
auto &I = SubtargetMap[CPU + FS];
if (!I) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bc047a4a..a1fb665 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
- // Only logical ops can be done on v4i8 directly, others must be done
+ // Only logical ops can be done on v4i8/v2i32 directly, others must be done
// elementwise.
setOperationAction(
{ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
@@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
ISD::USUBSAT},
- MVT::v4i8, Expand);
+ {MVT::v4i8, MVT::v2i32}, Expand);
// Operations not directly supported by NVPTX.
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
@@ -689,7 +689,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 40c05e8..333b693 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1823,6 +1823,11 @@ def TuneConditionalCompressedMoveFusion
def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">;
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
+def TuneHasSingleElementVecFP64
+ : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true",
+ "Certain vector FP64 operations produce a single result "
+ "element per cycle">;
+
def TuneMIPSP8700
: SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700",
"MIPS p8700 processor">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 447f05c..f2724c41 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1636,7 +1636,7 @@ def : QCISELECTCCIPat<SETNE, QC_SELECTNEI>;
}
let Predicates = [HasVendorXqcilsm, IsRV32] in {
-def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7),
+def : Pat<(qc_setwmi (i32 GPR:$rs3), GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7),
(QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>;
} // Predicates = [HasVendorXqcilsm, IsRV32]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index a29b7dd..57fbaa0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -634,56 +634,56 @@ def : PatGpr<bswap, REV8_RV64, i64>;
let Predicates = [HasStdExtZbkb] in {
def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl zexti8:$rs2, (XLenVT 8)),
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, zexti8:$rs2)>;
def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1), 0xFFFF),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbkb]
let Predicates = [HasStdExtZbkb, IsRV32] in {
-def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
- (PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i32 (or zexti16:$rs1, (shl GPR:$rs2, (i32 16)))),
+ (PACK zexti16:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl GPR:$rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(i32 (or (shl GPR:$rs2, (XLenVT 24)),
+ (shl zexti8:$rs1, (XLenVT 16)))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>;
// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
// bits [15:0] coming from a zero extended value. We can use pack with packh for
// bits [31:16]. If bits [15:0] can also be a packh, it can be matched
// separately.
-def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (zexti16 (XLenVT GPR:$rs1))),
- (PACK (XLenVT GPR:$rs1),
- (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(i32 (or (or (shl GPR:$op1rs2, (XLenVT 24)),
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ zexti16:$rs1)),
+ (PACK zexti16:$rs1,
+ (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
}
let Predicates = [HasStdExtZbkb, IsRV64] in {
-def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
- (PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i64 (or zexti32:$rs1, (shl GPR:$rs2, (i64 32)))),
+ (PACK zexti32:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(i64 (or (shl zexti8:$rs2, (XLenVT 24)),
+ (shl zexti8:$rs1, (XLenVT 16)))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, zexti8:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+ (shl zexti8:$rs1, (XLenVT 16))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
- (zexti16 (i64 GPR:$rs1))),
- (PACKW GPR:$rs1, GPR:$rs2)>;
+ zexti16:$rs1),
+ (PACKW zexti16:$rs1, GPR:$rs2)>;
def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
- (zexti16 (i64 GPR:$rs1)))),
- (PACKW GPR:$rs1, GPR:$rs2)>;
+ zexti16:$rs1)),
+ (PACKW zexti16:$rs1, GPR:$rs2)>;
// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
// bits [15:0] coming from a zero extended value, and bits [63:32] being
@@ -691,35 +691,35 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
// also be a packh, it can be matched separately.
def : Pat<(binop_allwusers<or>
(or (shl GPR:$op1rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (zexti16 (XLenVT GPR:$rs1))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ zexti16:$rs1),
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
// We need to manually reassociate the patterns because of the binop_allwusers.
def : Pat<(binop_allwusers<or>
- (or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (or zexti16:$rs1,
+ (shl zexti8:$op1rs1, (XLenVT 16))),
(shl GPR:$op1rs2, (XLenVT 24))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(binop_allwusers<or>
- (or (zexti16 (XLenVT GPR:$rs1)),
- (shl GPR:$op1rs1, (XLenVT 24))),
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (or zexti16:$rs1,
+ (shl GPR:$op1rs2, (XLenVT 24))),
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
- (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ (sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))),
+ (PACKW GPR:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
// Match a pattern of 2 halfwords being inserted into bits [63:32], with bits
// bits [31:0] coming from a zero extended value. We can use pack with packw for
// bits [63:32]. If bits [63:31] can also be a packw, it can be matched
// separately.
def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)),
- (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))),
- (zexti32 (i64 GPR:$rs1))),
- (PACK (XLenVT GPR:$rs1),
- (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti16:$op1rs1, (i64 32))),
+ zexti32:$rs1),
+ (PACK zexti32:$rs1,
+ (XLenVT (PACKW zexti16:$op1rs1, GPR:$op1rs2)))>;
} // Predicates = [HasStdExtZbkb, IsRV64]
let Predicates = [HasStdExtZbb, IsRV32] in
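For reading the rewritten patterns, a reference model of the RV64 Zbkb pack semantics they select, written as plain C++ and taken from the Zbkb specification rather than this patch: packh packs the low bytes of rs1/rs2 into bits [15:0], packw packs the low halfwords into a sign-extended 32-bit result, and pack packs the low words.

    #include <cstdint>

    // Reference semantics sketch (RV64, Zbkb); not LLVM code.
    uint64_t packh(uint64_t rs1, uint64_t rs2) {
      return (rs1 & 0xff) | ((rs2 & 0xff) << 8);     // bytes -> bits [15:0]
    }
    uint64_t packw(uint64_t rs1, uint64_t rs2) {
      uint32_t Lo = (uint32_t)(rs1 & 0xffff) | ((uint32_t)(rs2 & 0xffff) << 16);
      return (uint64_t)(int64_t)(int32_t)Lo;         // sign-extended to 64 bits
    }
    uint64_t pack(uint64_t rs1, uint64_t rs2) {
      return (rs1 & 0xffffffffULL) | ((rs2 & 0xffffffffULL) << 32);
    }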
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 6d86aff..3658817 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -14,6 +14,10 @@
// otherwise.
def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
+// This scheduling predicate is true when the subtarget feature
+// TuneHasSingleElementVecFP64 is enabled.
+def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>;
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
def isSEXT_W
: TIIPredicate<"isSEXT_W",
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 17a7948..e86431f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
FeatureStdExtZvl1024b,
FeatureVendorXSiFivecdiscarddlone,
FeatureVendorXSiFivecflushdlone],
- SiFiveIntelligenceTuneFeatures>;
+ !listconcat(SiFiveIntelligenceTuneFeatures,
+ [TuneHasSingleElementVecFP64])>;
defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 3e07eff..f863392a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN,
ProcResourceKind VL, ProcResourceKind VS,
ProcResourceKind VCQ,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled = false,
bit hasFastGather = false> {
// Branching
@@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN,
// 13. Vector Floating-Point Instructions
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8);
- defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- }
- defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4);
- let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>;
- // min max require merge
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 64) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF",
+ "WriteVFMulAddV", "WriteVFMulAddF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
+ let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ // min max require merge
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN,
// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
- defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
@@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN,
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
}
- defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
@@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN,
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesNarrowing<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance {
/// eventually be supplied by different SchedMachineModels.
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled,
bit hasFastGather> {
defm SiFive7 : SiFive7ProcResources<extraVALU>;
@@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
- SiFive7VCQ, fpLatencies, isFP64Throttled,
- hasFastGather>;
+ SiFive7VCQ, fpLatencies, hasFastGather>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
bit HasExtraVALU = false;
SiFive7FPLatencies FPLatencies;
- bit IsFP64Throttled = false;
bit HasFastGather = false;
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
@@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
let HasExtraVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
- let IsFP64Throttled = true;
let HasFastGather = true;
}
@@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
model.FPLatencies,
- model.IsFP64Throttled,
model.HasFastGather>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 01a4308..d11b446 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
IsWorstCase>;
}
+multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ string mx, int sew, bit IsWorstCase> {
+ defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources,
+ predLat, predAcquireCycles,
+ predReleaseCycles, noPredResources,
+ noPredLat, noPredAcquireCycles,
+ noPredReleaseCycles,
+ IsWorstCase>;
+}
+
// Define multiclasses to define SchedWrite, SchedRead, WriteRes, and
// ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the
// SchedMxList variants above. Each multiclass is responsible for defining
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index af79070..275165d 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -184,8 +184,8 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
// Output the TLS marker if present.
if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
const MCOperand &MO = MI->getOperand(OpNum + 1);
- const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
- switch (refExp.getSpecifier()) {
+ const MCSymbolRefExpr &RefExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+ switch (RefExp.getSpecifier()) {
case SystemZ::S_TLSGD:
O << ":tls_gdcall:";
break;
@@ -195,7 +195,7 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
default:
llvm_unreachable("Unexpected symbol kind");
}
- O << refExp.getSymbol().getName();
+ O << RefExp.getSymbol().getName();
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index fce6393..8c31579 100644
--- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -13,10 +13,9 @@
using namespace llvm;
-SystemZConstantPoolValue::
-SystemZConstantPoolValue(const GlobalValue *gv,
- SystemZCP::SystemZCPModifier modifier)
- : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+SystemZConstantPoolValue::SystemZConstantPoolValue(
+ const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier)
+ : MachineConstantPoolValue(GV->getType()), GV(GV), Modifier(Modifier) {}
SystemZConstantPoolValue *
SystemZConstantPoolValue::Create(const GlobalValue *GV,
diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 34d58e0..5313fba 100644
--- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -352,10 +352,9 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
// Similarly, a group-ending SU may either fit well (last in group), or
// end the group prematurely.
if (SC->EndGroup) {
- unsigned resultingGroupSize =
- (CurrGroupSize + getNumDecoderSlots(SU));
- if (resultingGroupSize < 3)
- return (3 - resultingGroupSize);
+ unsigned ResultingGroupSize = (CurrGroupSize + getNumDecoderSlots(SU));
+ if (ResultingGroupSize < 3)
+ return (3 - ResultingGroupSize);
return -1;
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 764ff998..4b3ddbd 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -592,10 +592,10 @@ def : Pat<(X86mcvttp2sis (v2f64 (X86VBroadcastld64 addr:$src)),
(VCVTTPD2DQSZ128rmbkz VK2WM:$mask, addr:$src)>;
// Patterns VCVTTPD2UDQSZ128
-def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
- (VCVTTPD2UDQSZ128rmb addr:$src)>;
def : Pat<(v4i32 (X86cvttp2uis (v2f64 VR128X:$src))),
(VCVTTPD2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32 (X86cvttp2uis (loadv2f64 addr:$src))),
+ (VCVTTPD2UDQSZ128rm addr:$src)>;
def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2UDQSZ128rmb addr:$src)>;
def : Pat<(X86mcvttp2uis (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 9b9e2ba..9150b58 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -459,7 +459,7 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
Value *Op0 = I->getOperand(0);
Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
- Res = Builder.CreateSelect(Op0, LHS, RHS);
+ Res = Builder.CreateSelect(Op0, LHS, RHS, "", I);
break;
}
case Instruction::PHI: {
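Note on the hunk above: the extra argument to CreateSelect passes the original instruction as the metadata source, so profile metadata on the unreduced select survives the narrowing instead of being dropped. A minimal sketch of the same IRBuilder idiom, assuming an original SelectInst and already-narrowed operands (the helper name is hypothetical):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Rebuild a select over narrowed operands; passing MDFrom copies !prof and
  // !unpredictable metadata from the original select onto the new one.
  static Value *buildNarrowedSelect(IRBuilder<> &Builder, SelectInst *Orig,
                                    Value *NarrowTrue, Value *NarrowFalse) {
    return Builder.CreateSelect(Orig->getCondition(), NarrowTrue, NarrowFalse,
                                "", /*MDFrom=*/Orig);
  }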
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index 9115946..f166fef 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -24,6 +24,9 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -33,6 +36,11 @@ using namespace llvm;
#define DEBUG_TYPE "coro-annotation-elide"
+static cl::opt<float> CoroElideBranchRatio(
+ "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden,
+ cl::desc("Minimum BranchProbability to consider a elide a coroutine."));
+extern cl::opt<unsigned> MinBlockCounterExecution;
+
static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
for (Instruction &I : F->getEntryBlock())
if (!isa<AllocaInst>(&I))
@@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
if (IsCallerPresplitCoroutine && HasAttr) {
+ BranchProbability MinBranchProbability(
+ static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
+ MinBlockCounterExecution);
+
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+ auto Prob = BranchProbability::getBranchProbability(
+ BFI.getBlockFreq(CB->getParent()).getFrequency(),
+ BFI.getEntryFreq().getFrequency());
+
+ if (Prob < MinBranchProbability) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
+ << "'" << ore::NV("callee", Callee->getName())
+ << "' not elided in '"
+ << ore::NV("caller", Caller->getName())
+ << "' because of low probability: "
+ << ore::NV("probability", Prob) << " (threshold: "
+ << ore::NV("threshold", MinBranchProbability) << ")";
+ });
+ continue;
+ }
+
auto *CallerN = CG.lookup(*Caller);
auto *CallerC = CallerN ? CG.lookupSCC(*CallerN) : nullptr;
// If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller
@@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' elided in '" << ore::NV("caller", Caller->getName())
- << "'";
+ << "' (probability: " << ore::NV("probability", Prob) << ")";
});
FAM.invalidate(*Caller, PreservedAnalyses::none());
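The guard added above computes how likely the annotated call site is to execute relative to the caller's entry block and skips the elision attempt below a configurable threshold. A minimal sketch of that computation in isolation, assuming a BlockFrequencyInfo for the caller; the helper name and Scale parameter are illustrative (Scale stands in for MinBlockCounterExecution):

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/IR/InstrTypes.h"
  #include "llvm/Support/BranchProbability.h"

  using namespace llvm;

  // Mirrors the check in the hunk above: the call site's relative block
  // frequency must reach Ratio (e.g. 0.55) expressed over denominator Scale.
  static bool likelyEnoughToElide(const CallBase &CB,
                                  const BlockFrequencyInfo &BFI, float Ratio,
                                  unsigned Scale) {
    BranchProbability Prob = BranchProbability::getBranchProbability(
        BFI.getBlockFreq(CB.getParent()).getFrequency(),
        BFI.getEntryFreq().getFrequency());
    BranchProbability Threshold(static_cast<uint32_t>(Ratio * Scale), Scale);
    return Prob >= Threshold;
  }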
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2583249..1a00d17 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio(
"outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
-static cl::opt<unsigned>
+cl::opt<unsigned>
MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
cl::desc("Minimum block executions to consider "
"its BranchProbabilityInfo valid"));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 9b272c4..3ddf182 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -28,6 +28,10 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
/// This is the complement of getICmpCode, which turns an opcode and two
/// operands into either a constant true or false, or a brand new ICmp
/// instruction. The sign is passed in to determine which kind of predicate to
@@ -1272,7 +1276,8 @@ Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) {
static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool IsAnd, bool IsLogical,
InstCombiner::BuilderTy &Builder,
- const SimplifyQuery &Q) {
+ const SimplifyQuery &Q,
+ Instruction &I) {
// Match an equality compare with a non-poison constant as Cmp0.
// Also, give up if the compare can be constant-folded to avoid looping.
CmpPredicate Pred0;
@@ -1306,9 +1311,12 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
return nullptr;
SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
}
- if (IsLogical)
- return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp)
- : Builder.CreateLogicalOr(Cmp0, SubstituteCmp);
+ if (IsLogical) {
+ Instruction *MDFrom =
+ ProfcheckDisableMetadataFixes && isa<SelectInst>(I) ? nullptr : &I;
+ return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp, "", MDFrom)
+ : Builder.CreateLogicalOr(Cmp0, SubstituteCmp, "", MDFrom);
+ }
return Builder.CreateBinOp(IsAnd ? Instruction::And : Instruction::Or, Cmp0,
SubstituteCmp);
}
@@ -3396,13 +3404,13 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
/*IsLogical*/ false, Builder))
return V;
- if (Value *V =
- foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, Builder, Q))
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical,
+ Builder, Q, I))
return V;
// We can convert this case to bitwise and, because both operands are used
// on the LHS, and as such poison from both will propagate.
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd,
- /*IsLogical=*/false, Builder, Q)) {
+ if (Value *V = foldAndOrOfICmpsWithConstEq(
+ RHS, LHS, IsAnd, /*IsLogical=*/false, Builder, Q, I)) {
// If RHS is still used, we should drop samesign flag.
if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) {
RHS->setSameSign(false);
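For context on why the instruction is threaded through here: CreateLogicalAnd and CreateLogicalOr emit selects, so they can carry the branch weights of the instruction they replace when a metadata source is supplied. A sketch using the same MDFrom overloads called in the hunk above (the helper name is illustrative):

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Logical and/or of i1 values lower to selects:
  //   and -> select A, B, false      or -> select A, true, B
  static Value *rebuildLogicalOp(IRBuilder<> &Builder, bool IsAnd, Value *A,
                                 Value *B, Instruction *MDFrom) {
    return IsAnd ? Builder.CreateLogicalAnd(A, B, "", MDFrom)
                 : Builder.CreateLogicalOr(A, B, "", MDFrom);
  }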
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 5b8ea15..b74a070 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IRB.CreateIsNull(Load), &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
Load->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
@@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
EstimatedStackSize >= Options.StackDepthCallbackMin) {
if (InsertBefore)
IRB.SetInsertPoint(InsertBefore);
- IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge();
+ auto Call = IRB.CreateCall(SanCovStackDepthCallback);
+ if (EntryLoc)
+ Call->setDebugLoc(EntryLoc);
+ Call->setCannotMerge();
}
} else {
// Check stack depth. If it's the deepest so far, record it.
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IsStackLower, &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
LowestStack->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
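All three hunks above stamp the newly inserted instrumentation with the entry debug location so the new instructions are not left without line info. The basic pattern, as a sketch with hypothetical names:

  #include "llvm/IR/DebugLoc.h"
  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Store a flag and reuse an existing debug location for the new store.
  static StoreInst *emitFlagStore(IRBuilder<> &IRB, Value *Flag, Value *FlagPtr,
                                  const DebugLoc &EntryLoc) {
    StoreInst *Store = IRB.CreateStore(Flag, FlagPtr);
    if (EntryLoc)
      Store->setDebugLoc(EntryLoc);
    return Store;
  }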
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e448230..f4e05a2 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DomTreeUpdater.h"
@@ -382,19 +383,28 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap;
typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap;
inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
- OS << "< ";
- for (const BasicBlock *BB : Path) {
- std::string BBName;
- if (BB->hasName())
- raw_string_ostream(BBName) << BB->getName();
- else
- raw_string_ostream(BBName) << BB;
- OS << BBName << " ";
- }
- OS << ">";
+ auto BBNames = llvm::map_range(
+ Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); });
+ OS << "< " << llvm::join(BBNames, ", ") << " >";
return OS;
}
+/// Helper to get the successor corresponding to a particular case value for
+/// a switch statement.
+static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch,
+ const APInt &NextState) {
+ BasicBlock *NextCase = nullptr;
+ for (auto Case : Switch->cases()) {
+ if (Case.getCaseValue()->getValue() == NextState) {
+ NextCase = Case.getCaseSuccessor();
+ break;
+ }
+ }
+ if (!NextCase)
+ NextCase = Switch->getDefaultDest();
+ return NextCase;
+}
+
namespace {
/// ThreadingPath is a path in the control flow of a loop that can be threaded
/// by cloning necessary basic blocks and replacing conditional branches with
@@ -407,6 +417,10 @@ struct ThreadingPath {
ExitVal = V->getValue();
IsExitValSet = true;
}
+ void setExitValue(const APInt &V) {
+ ExitVal = V;
+ IsExitValSet = true;
+ }
bool isExitValueSet() const { return IsExitValSet; }
/// Determinator is the basic block that determines the next state of the DFA.
@@ -423,7 +437,7 @@ struct ThreadingPath {
}
void print(raw_ostream &OS) const {
- OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]";
+ OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]";
}
private:
@@ -589,44 +603,8 @@ struct AllSwitchPaths {
BasicBlock *getSwitchBlock() { return SwitchBlock; }
void run() {
- StateDefMap StateDef = getStateDefMap();
- if (StateDef.empty()) {
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable",
- Switch)
- << "Switch instruction is not predictable.";
- });
- return;
- }
-
- auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0));
- auto *SwitchPhiDefBB = SwitchPhi->getParent();
- VisitedBlocks VB;
- // Get paths from the determinator BBs to SwitchPhiDefBB
- std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
- if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
- TPaths = std::move(PathsToPhiDef);
- return;
- }
-
- assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
- auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
- // Find and append paths from SwitchPhiDefBB to SwitchBlock.
- PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
- if (PathsToSwitchBB.empty())
- return;
-
- std::vector<ThreadingPath> TempList;
- for (const ThreadingPath &Path : PathsToPhiDef) {
- for (const PathType &PathToSw : PathsToSwitchBB) {
- ThreadingPath PathCopy(Path);
- PathCopy.appendExcludingFirst(PathToSw);
- TempList.push_back(PathCopy);
- }
- }
- TPaths = std::move(TempList);
+ findTPaths();
+ unifyTPaths();
}
private:
@@ -818,6 +796,69 @@ private:
return Res;
}
+ // Find all threadable paths.
+ void findTPaths() {
+ StateDefMap StateDef = getStateDefMap();
+ if (StateDef.empty()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable",
+ Switch)
+ << "Switch instruction is not predictable.";
+ });
+ return;
+ }
+
+ auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0));
+ auto *SwitchPhiDefBB = SwitchPhi->getParent();
+ VisitedBlocks VB;
+ // Get paths from the determinator BBs to SwitchPhiDefBB
+ std::vector<ThreadingPath> PathsToPhiDef =
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+ if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
+ TPaths = std::move(PathsToPhiDef);
+ return;
+ }
+
+ assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
+ auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
+ // Find and append paths from SwitchPhiDefBB to SwitchBlock.
+ PathsType PathsToSwitchBB =
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
+ if (PathsToSwitchBB.empty())
+ return;
+
+ std::vector<ThreadingPath> TempList;
+ for (const ThreadingPath &Path : PathsToPhiDef) {
+ for (const PathType &PathToSw : PathsToSwitchBB) {
+ ThreadingPath PathCopy(Path);
+ PathCopy.appendExcludingFirst(PathToSw);
+ TempList.push_back(PathCopy);
+ }
+ }
+ TPaths = std::move(TempList);
+ }
+
+ // Two states are equivalent if they have the same switch destination.
+ // Unify the states across different threading paths if the states are equivalent.
+ void unifyTPaths() {
+ llvm::SmallDenseMap<BasicBlock *, APInt> DestToState;
+ for (ThreadingPath &Path : TPaths) {
+ APInt NextState = Path.getExitValue();
+ BasicBlock *Dest = getNextCaseSuccessor(Switch, NextState);
+ auto StateIt = DestToState.find(Dest);
+ if (StateIt == DestToState.end()) {
+ DestToState.insert({Dest, NextState});
+ continue;
+ }
+
+ if (NextState != StateIt->second) {
+ LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to "
+ << StateIt->second << "\n");
+ Path.setExitValue(StateIt->second);
+ }
+ }
+ }
+
unsigned NumVisited = 0;
SwitchInst *Switch;
BasicBlock *SwitchBlock;
@@ -1341,21 +1382,6 @@ private:
return It != ClonedBBs.end() ? (*It).BB : nullptr;
}
- /// Helper to get the successor corresponding to a particular case value for
- /// a switch statement.
- BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) {
- BasicBlock *NextCase = nullptr;
- for (auto Case : Switch->cases()) {
- if (Case.getCaseValue()->getValue() == NextState) {
- NextCase = Case.getCaseSuccessor();
- break;
- }
- }
- if (!NextCase)
- NextCase = Switch->getDefaultDest();
- return NextCase;
- }
-
/// Returns true if IncomingBB is a predecessor of BB.
bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) {
return llvm::is_contained(predecessors(BB), IncomingBB);
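The rewritten operator<< near the top of this file's diff builds the path string with map_range plus join from StringExtras instead of a manual loop. The same idiom applied to a plain list of blocks, as a sketch (the helper name is illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Map each block to a printable name, then join the names with a separator.
  static void printBlockList(raw_ostream &OS,
                             ArrayRef<const BasicBlock *> Blocks) {
    auto Names = map_range(
        Blocks, [](const BasicBlock *BB) { return BB->getName().str(); });
    OS << "< " << join(Names, ", ") << " >";
  }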
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..9ac3be1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
unsigned PreviousEdges = OtherCases->size();
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ unsigned E = PreviousEdges - 1;
+ // If NewBI is unconditional, OtherDest is unreachable: drop every incoming value for SI's block.
+ if (NewBI->isUnconditional())
+ ++E;
+ for (unsigned I = 0; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
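The bumped loop bound above accounts for the edge that disappears when the replacement branch is unconditional: each removeIncomingValue call drops one PHI entry for the switch block. A sketch of that counting with hypothetical names:

  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // A block reached through OldEdgeCount switch edges keeps one PHI entry for
  // the predecessor if a single new edge remains, and none otherwise.
  static void dropPhiEntriesForPred(PHINode &Phi, BasicBlock *Pred,
                                    unsigned OldEdgeCount, bool OneEdgeRemains) {
    unsigned NumToRemove = OneEdgeRemains ? OldEdgeCount - 1 : OldEdgeCount;
    for (unsigned I = 0; I != NumToRemove; ++I)
      Phi.removeIncomingValue(Pred);
  }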
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f16b03..e62d57e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Instruction *I = Worklist.pop_back_val();
for (auto &Op : I->operands())
if (auto *InstOp = dyn_cast<Instruction>(Op))
- if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
AddrDefs.insert(InstOp).second)
Worklist.push_back(InstOp);
}
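The widened check above accepts any operand defined inside the loop rather than only in the same basic block as its user. A sketch of the predicate, with a hypothetical helper name:

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // True for non-PHI operands whose defining instruction lives anywhere in L.
  static bool isLoopDefinedNonPhiOperand(const Loop &L, Value *Op) {
    auto *InstOp = dyn_cast<Instruction>(Op);
    return InstOp && !isa<PHINode>(InstOp) && L.contains(InstOp);
  }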