Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/ConstantFolding.cpp | 30
-rw-r--r--  llvm/lib/Analysis/IVDescriptors.cpp | 150
-rw-r--r--  llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5
-rw-r--r--  llvm/lib/Analysis/LoopCacheAnalysis.cpp | 81
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 14
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp | 17
-rw-r--r--  llvm/lib/BinaryFormat/Magic.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/MachineFunction.cpp | 30
-rw-r--r--  llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 39
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4
-rw-r--r--  llvm/lib/IR/Attributes.cpp | 11
-rw-r--r--  llvm/lib/IR/Constants.cpp | 11
-rw-r--r--  llvm/lib/Object/Binary.cpp | 1
-rw-r--r--  llvm/lib/Object/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Object/DXContainer.cpp | 44
-rw-r--r--  llvm/lib/Object/ObjectFile.cpp | 1
-rw-r--r--  llvm/lib/Support/CommandLine.cpp | 67
-rw-r--r--  llvm/lib/Support/Host.cpp | 6
-rw-r--r--  llvm/lib/Support/Windows/Process.inc | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 18
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 18
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedAmpere1.td | 1136
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td | 25
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedPredicates.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 126
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMFrameLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/PowerPC/PPC.td | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 11
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 42
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 11
-rw-r--r--  llvm/lib/Target/X86/X86InstrAMX.td | 7
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXType.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeClient.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeServer.td | 12
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver2.td | 15
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 17
-rw-r--r--  llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 10
-rw-r--r--  llvm/lib/Transforms/IPO/Attributor.cpp | 10
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp | 3
-rw-r--r--  llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfiler.cpp | 31
-rw-r--r--  llvm/lib/Transforms/Scalar/GVNSink.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 14
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalarizer.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 29
-rw-r--r--  llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 320
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 176
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 114
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 40
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 865
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 21
68 files changed, 3029 insertions, 718 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 299ea33..b3f5b12 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -866,21 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
Type *IntIdxTy = DL.getIndexType(Ptr->getType());
- // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
- // "inttoptr (sub (ptrtoint Ptr), V)"
- if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) {
- auto *CE = dyn_cast<ConstantExpr>(Ops[1]);
- assert((!CE || CE->getType() == IntIdxTy) &&
- "CastGEPIndices didn't canonicalize index types!");
- if (CE && CE->getOpcode() == Instruction::Sub &&
- CE->getOperand(0)->isNullValue()) {
- Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
- Res = ConstantExpr::getSub(Res, CE->getOperand(1));
- Res = ConstantExpr::getIntToPtr(Res, ResTy);
- return ConstantFoldConstant(Res, DL, TLI);
- }
- }
-
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
if (!isa<ConstantInt>(Ops[i]))
return nullptr;
@@ -1336,6 +1321,19 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
DL, BaseOffset, /*AllowNonInbounds=*/true));
if (Base->isNullValue()) {
FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset);
+ } else {
+ // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V
+ if (GEP->getNumIndices() == 1 &&
+ GEP->getSourceElementType()->isIntegerTy(8)) {
+ auto *Ptr = cast<Constant>(GEP->getPointerOperand());
+ auto *Sub = dyn_cast<ConstantExpr>(GEP->getOperand(1));
+ Type *IntIdxTy = DL.getIndexType(Ptr->getType());
+ if (Sub && Sub->getType() == IntIdxTy &&
+ Sub->getOpcode() == Instruction::Sub &&
+ Sub->getOperand(0)->isNullValue())
+ FoldedValue = ConstantExpr::getSub(
+ ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1));
+ }
}
}
if (FoldedValue) {
@@ -3038,7 +3036,7 @@ static Constant *ConstantFoldFixedVectorCall(
// Gather a column of constants.
for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
// Some intrinsics use a scalar type for certain arguments.
- if (hasVectorIntrinsicScalarOpd(IntrinsicID, J)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, J)) {
Lane[J] = Operands[J];
continue;
}
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index e03cf6c..e4d706a 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -227,12 +227,10 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
return true;
}
-bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
- Loop *TheLoop, FastMathFlags FuncFMF,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB,
- AssumptionCache *AC,
- DominatorTree *DT) {
+bool RecurrenceDescriptor::AddReductionVar(
+ PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
+ RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE) {
if (Phi->getNumIncomingValues() != 2)
return false;
@@ -249,6 +247,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
Instruction *ExitInstruction = nullptr;
+
+ // Variable to keep the last visited store instruction. By the end of the
+ // algorithm it will either be null or hold the intermediate reduction
+ // value stored to the loop-invariant address.
+ StoreInst *IntermediateStore = nullptr;
+
// Indicates that we found a reduction operation in our scan.
bool FoundReduxOp = false;
@@ -314,6 +318,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// - By instructions outside of the loop (safe).
// * One value may have several outside users, but all outside
// uses must be of the same value.
+ // - By store instructions with a loop invariant address (safe with
+ // the following restrictions):
+ // * If there are several stores, all must have the same address.
+ // * Final value should be stored in that loop invariant address.
// - By an instruction that is not part of the reduction (not safe).
// This is either:
// * An instruction type other than PHI or the reduction operation.
@@ -321,6 +329,43 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
while (!Worklist.empty()) {
Instruction *Cur = Worklist.pop_back_val();
+ // Store instructions are allowed iff they store the reduction value to
+ // the same loop-invariant memory location.
+ if (auto *SI = dyn_cast<StoreInst>(Cur)) {
+ if (!SE) {
+ LLVM_DEBUG(dbgs() << "Store instructions are not processed without "
+ << "Scalar Evolution Analysis\n");
+ return false;
+ }
+
+ const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand());
+ // Check it is the same address as previous stores
+ if (IntermediateStore) {
+ const SCEV *OtherScev =
+ SE->getSCEV(IntermediateStore->getPointerOperand());
+
+ if (OtherScev != PtrScev) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << " and "
+ << *IntermediateStore->getPointerOperand() << '\n');
+ return false;
+ }
+ }
+
+ // Check the pointer is loop invariant
+ if (!SE->isLoopInvariant(PtrScev, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << '\n');
+ return false;
+ }
+
+ // IntermediateStore is always the last store in the loop.
+ IntermediateStore = SI;
+ continue;
+ }
+
// No Users.
// If the instruction has no users then this is a broken chain and can't be
// a reduction variable.
@@ -443,10 +488,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// reductions which are represented as a cmp followed by a select.
InstDesc IgnoredVal(false, nullptr);
if (VisitedInsts.insert(UI).second) {
- if (isa<PHINode>(UI))
+ if (isa<PHINode>(UI)) {
PHIs.push_back(UI);
- else
+ } else {
+ StoreInst *SI = dyn_cast<StoreInst>(UI);
+ if (SI && SI->getPointerOperand() == Cur) {
+ // Reduction variable chain can only be stored somewhere but it
+ // can't be used as an address.
+ return false;
+ }
NonPHIs.push_back(UI);
+ }
} else if (!isa<PHINode>(UI) &&
((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
!isa<SelectInst>(UI)) ||
@@ -474,6 +526,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
return false;
+ if (IntermediateStore) {
+ // Check that the stored value goes to the phi node again. This way we
+ // make sure that the value stored in IntermediateStore is indeed the
+ // final reduction value.
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) {
+ LLVM_DEBUG(dbgs() << "Not a final reduction value stored: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If there is an exit instruction, its value should be stored in
+ // IntermediateStore.
+ if (ExitInstruction &&
+ IntermediateStore->getValueOperand() != ExitInstruction) {
+ LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not "
+ "store last calculated value of the reduction: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If all uses are inside the loop (intermediate stores), then the
+ // reduction value after the loop will be the one used in the last store.
+ if (!ExitInstruction)
+ ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand());
+ }
+
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
@@ -535,9 +613,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// is saved as part of the RecurrenceDescriptor.
// Save the description of this reduction variable.
- RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
- RecurrenceType, IsSigned, IsOrdered, CastInsts,
- MinWidthCastToRecurrenceType);
+ RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind,
+ FMF, ExactFPMathInst, RecurrenceType, IsSigned,
+ IsOrdered, CastInsts, MinWidthCastToRecurrenceType);
RedDes = RD;
return true;
@@ -761,7 +839,8 @@ bool RecurrenceDescriptor::hasMultipleUsesOf(
bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes,
DemandedBits *DB, AssumptionCache *AC,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ ScalarEvolution *SE) {
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
FastMathFlags FMF;
@@ -770,72 +849,85 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
FMF.setNoSignedZeros(
F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool());
- if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
<< *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
<< " PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
return true;
}
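
For context, a minimal sketch of the loop shape the new store handling above is meant to accept (the function and names are hypothetical, not taken from this patch): the reduction value is also written to a loop-invariant address on every iteration, and only the last stored value is the final result.

    // Hypothetical C++ source: 's' is an add reduction that is also stored to
    // the loop-invariant address 'dst' inside the loop. With the checks above,
    // AddReductionVar records that store as IntermediateStore instead of
    // rejecting the reduction outright.
    void sum_and_publish(const int *a, int n, int *dst) {
      int s = 0;
      for (int i = 0; i < n; ++i) {
        s += a[i];
        *dst = s; // intermediate store to an invariant address
      }
    }
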
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index b1773db..d0276df 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1993,9 +1993,12 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
for (StoreInst *ST : Stores) {
Value *Ptr = ST->getPointerOperand();
- if (isUniform(Ptr))
+ if (isUniform(Ptr)) {
+ // Record store instructions to loop invariant addresses
+ StoresToInvariantAddresses.push_back(ST);
HasDependenceInvolvingLoopInvariantAddress |=
!UniformStores.insert(Ptr).second;
+ }
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index b7806b3..eacd2621 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
return StepRec == &ElemSize;
}
-/// Compute the trip count for the given loop \p L. Return the SCEV expression
-/// for the trip count or nullptr if it cannot be computed.
-static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
+/// Compute the trip count for the given loop \p L or assume a default value if
+/// it is not a compile time constant. Return the SCEV expression for the trip
+/// count.
+static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize,
+ ScalarEvolution &SE) {
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
- !isa<SCEVConstant>(BackedgeTakenCount))
- return nullptr;
- return SE.getTripCountFromExitCount(BackedgeTakenCount);
+ const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
+ isa<SCEVConstant>(BackedgeTakenCount))
+ ? SE.getTripCountFromExitCount(BackedgeTakenCount)
+ : nullptr;
+
+ if (!TripCount) {
+ LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
+ << " could not be computed, using DefaultTripCount\n");
+ TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount);
+ }
+
+ return TripCount;
}
//===----------------------------------------------------------------------===//
@@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
return 1;
}
- const SCEV *TripCount = computeTripCount(L, SE);
- if (!TripCount) {
- LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
- << " could not be computed, using DefaultTripCount\n");
- const SCEV *ElemSize = Sizes.back();
- TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
- }
+ const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE);
+ assert(TripCount && "Expecting valid TripCount");
LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
- // If the indexed reference is 'consecutive' the cost is
- // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
- const SCEV *RefCost = TripCount;
-
+ const SCEV *RefCost = nullptr;
if (isConsecutive(L, CLS)) {
+ // If the indexed reference is 'consecutive' the cost is
+ // (TripCount*Stride)/CLS.
const SCEV *Coeff = getLastCoefficient();
const SCEV *ElemSize = Sizes.back();
+ assert(Coeff->getType() == ElemSize->getType() &&
+ "Expecting the same type");
const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
@@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
<< *RefCost << "\n");
- } else
+ } else {
+ // If the indexed reference is not 'consecutive' the cost is proportional to
+ // the trip count and the depth of the dimension which the subject loop
+ // subscript is accessing. We try to estimate this by multiplying the cost
+ // by the trip counts of loops corresponding to the inner dimensions. For
+ // example, given the indexed reference 'A[i][j][k]', and assuming the
+ // i-loop is in the innermost position, the cost would be equal to the
+ // iterations of the i-loop multiplied by iterations of the j-loop.
+ RefCost = TripCount;
+
+ int Index = getSubscriptIndex(L);
+ assert(Index >= 0 && "Could not locate a valid Index");
+
+ for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I));
+ assert(AR && AR->getLoop() && "Expecting valid loop");
+ const SCEV *TripCount =
+ computeTripCount(*AR->getLoop(), *Sizes.back(), SE);
+ Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
+ RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType),
+ SE.getNoopOrAnyExtend(TripCount, WiderType));
+ }
+
LLVM_DEBUG(dbgs().indent(4)
- << "Access is not consecutive: RefCost=TripCount=" << *RefCost
- << "\n");
+ << "Access is not consecutive: RefCost=" << *RefCost << "\n");
+ }
+ assert(RefCost && "Expecting a valid RefCost");
// Attempt to fold RefCost into a constant.
if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
@@ -481,6 +510,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
}
+int IndexedReference::getSubscriptIndex(const Loop &L) const {
+ for (auto Idx : seq<int>(0, getNumSubscripts())) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx));
+ if (AR && AR->getLoop() == &L) {
+ return Idx;
+ }
+ }
+ return -1;
+}
+
const SCEV *IndexedReference::getLastCoefficient() const {
const SCEV *LastSubscript = getLastSubscript();
auto *AR = cast<SCEVAddRecExpr>(LastSubscript);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 75381f5..0144ce4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -282,6 +282,20 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
match(LHS, m_c_And(m_Specific(M), m_Value())))
return true;
}
+
+ // X op (Y & ~X)
+ if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())) ||
+ match(LHS, m_c_And(m_Not(m_Specific(RHS)), m_Value())))
+ return true;
+
+ // X op ((X & Y) ^ Y) -- this is the canonical form of the previous pattern
+ // for constant Y.
+ Value *Y;
+ if (match(RHS,
+ m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))) ||
+ match(LHS, m_c_Xor(m_c_And(m_Specific(RHS), m_Value(Y)), m_Deferred(Y))))
+ return true;
+
// Look for: (A & B) op ~(A | B)
{
Value *A, *B;
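
A small worked check of the two patterns added above (values are arbitrary; the snippet is illustrative, not LLVM code): Y & ~X shares no bits with X, and (X & Y) ^ Y computes the same value, which is why that xor form is treated as the canonical spelling when Y is a constant.

    #include <cassert>

    int main() {
      unsigned X = 0b1100, Y = 0b1010;
      unsigned A = Y & ~X;      // 0b0010
      unsigned B = (X & Y) ^ Y; // also 0b0010
      assert(A == B);           // the xor form equals Y & ~X
      assert((A & X) == 0);     // no common bits, so X + A == (X | A)
      return 0;
    }
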
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 5f8fa13..a53b216 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -40,7 +40,7 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic, and vectors for the vector form of the
/// intrinsic (except operands that are marked as always being scalar by
-/// hasVectorIntrinsicScalarOpd).
+/// isVectorIntrinsicWithScalarOpAtArg).
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::abs: // Begin integer bit-manipulation.
@@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::fmuladd:
case Intrinsic::powi:
case Intrinsic::canonicalize:
+ case Intrinsic::fptosi_sat:
+ case Intrinsic::fptoui_sat:
return true;
default:
return false;
@@ -96,8 +98,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
}
/// Identifies if the vector form of the intrinsic has a scalar operand.
-bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID,
- unsigned ScalarOpdIdx) {
+bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
+ unsigned ScalarOpdIdx) {
switch (ID) {
case Intrinsic::abs:
case Intrinsic::ctlz:
@@ -114,11 +116,14 @@ bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID,
}
}
-bool llvm::hasVectorIntrinsicOverloadedScalarOpd(Intrinsic::ID ID,
- unsigned ScalarOpdIdx) {
+bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
+ unsigned OpdIdx) {
switch (ID) {
+ case Intrinsic::fptosi_sat:
+ case Intrinsic::fptoui_sat:
+ return OpdIdx == 0;
case Intrinsic::powi:
- return (ScalarOpdIdx == 1);
+ return OpdIdx == 1;
default:
return false;
}
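
A hedged usage sketch of the renamed helpers (the wrapper below is hypothetical, not part of this patch): callers such as the vectorizers query them per operand to decide which arguments stay scalar and which operand types belong to the intrinsic's overloaded signature.

    #include "llvm/Analysis/VectorUtils.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    // Pick the type a widened intrinsic call would use for operand ArgIdx.
    static Type *pickOperandType(Intrinsic::ID ID, unsigned ArgIdx,
                                 Type *ScalarTy, ElementCount VF) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ArgIdx))
        return ScalarTy;                    // e.g. the i32 exponent of llvm.powi
      return VectorType::get(ScalarTy, VF); // e.g. the fp source of fptosi.sat
    }

    // isVectorIntrinsicWithOverloadTypeAtArg(ID, 0) additionally reports that
    // fptosi.sat/fptoui.sat name their source type in the overloaded signature.
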
diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp
index 5d999a90..d48adb1 100644
--- a/llvm/lib/BinaryFormat/Magic.cpp
+++ b/llvm/lib/BinaryFormat/Magic.cpp
@@ -225,6 +225,11 @@ file_magic llvm::identify_magic(StringRef Magic) {
if (startswith(Magic, "--- !tapi") || startswith(Magic, "---\narchs:"))
return file_magic::tapi_file;
break;
+
+ case 'D': // DirectX container file - DXBC
+ if (startswith(Magic, "DXBC") && Magic.size() == 4)
+ return file_magic::dxcontainer_object;
+ break;
default:
break;
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 06830e8..f2b0024 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -1033,7 +1033,32 @@ void MachineFunction::substituteDebugValuesForInst(const MachineInstr &Old,
}
}
-auto MachineFunction::salvageCopySSA(MachineInstr &MI)
+auto MachineFunction::salvageCopySSA(
+ MachineInstr &MI, DenseMap<Register, DebugInstrOperandPair> &DbgPHICache)
+ -> DebugInstrOperandPair {
+ const TargetInstrInfo &TII = *getSubtarget().getInstrInfo();
+
+ // Check whether this copy-like instruction has already been salvaged into
+ // an operand pair.
+ Register Dest;
+ if (auto CopyDstSrc = TII.isCopyInstr(MI)) {
+ Dest = CopyDstSrc->Destination->getReg();
+ } else {
+ assert(MI.isSubregToReg());
+ Dest = MI.getOperand(0).getReg();
+ }
+
+ auto CacheIt = DbgPHICache.find(Dest);
+ if (CacheIt != DbgPHICache.end())
+ return CacheIt->second;
+
+ // Calculate the instruction number to use, or install a DBG_PHI.
+ auto OperandPair = salvageCopySSAImpl(MI);
+ DbgPHICache.insert({Dest, OperandPair});
+ return OperandPair;
+}
+
+auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI)
-> DebugInstrOperandPair {
MachineRegisterInfo &MRI = getRegInfo();
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
@@ -1189,6 +1214,7 @@ void MachineFunction::finalizeDebugInstrRefs() {
MI.getOperand(1).ChangeToRegister(0, false);
};
+ DenseMap<Register, DebugInstrOperandPair> ArgDbgPHIs;
for (auto &MBB : *this) {
for (auto &MI : MBB) {
if (!MI.isDebugRef() || !MI.getOperand(0).isReg())
@@ -1211,7 +1237,7 @@ void MachineFunction::finalizeDebugInstrRefs() {
// instruction that defines the source value, see salvageCopySSA docs
// for why this is important.
if (DefMI.isCopyLike() || TII->isCopyInstr(DefMI)) {
- auto Result = salvageCopySSA(DefMI);
+ auto Result = salvageCopySSA(DefMI, ArgDbgPHIs);
MI.getOperand(0).ChangeToImmediate(Result.first);
MI.getOperand(1).setImm(Result.second);
} else {
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 6887347..87b8ac5 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -109,7 +109,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
auto *ArgType = Arg.value()->getType();
// Vector calls to intrinsics can still have
// scalar operands for specific arguments.
- if (hasVectorIntrinsicScalarOpd(IntrinsicID, Arg.index())) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
ScalarTypes.push_back(ArgType);
} else {
// The argument in this place should be a vector if
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e139cf6..e483c3a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -519,7 +519,9 @@ namespace {
SDValue XformToShuffleWithZero(SDNode *N);
bool reassociationCanBreakAddressingModePattern(unsigned Opc,
- const SDLoc &DL, SDValue N0,
+ const SDLoc &DL,
+ SDNode *N,
+ SDValue N0,
SDValue N1);
SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1);
@@ -996,6 +998,7 @@ static bool canSplitIdx(LoadSDNode *LD) {
bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL,
+ SDNode *N,
SDValue N0,
SDValue N1) {
// Currently this only tries to ensure we don't undo the GEP splits done by
@@ -1025,7 +1028,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
return false;
const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
- for (SDNode *Node : N0->uses()) {
+ for (SDNode *Node : N->uses()) {
auto LoadStore = dyn_cast<MemSDNode>(Node);
if (LoadStore) {
// Is x[offset2] already not a legal addressing mode? If so then
@@ -2447,7 +2450,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
return NewSel;
// reassociate add
- if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) {
+ if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) {
if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
return RADD;
@@ -15527,7 +15530,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
// This means this is also safe for a signed input and unsigned output, since
// a negative input would lead to undefined behavior.
unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
- unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
+ unsigned OutputSize = (int)VT.getScalarSizeInBits();
unsigned ActualSize = std::min(InputSize, OutputSize);
const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d667988..90e4b5d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4684,26 +4684,33 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
return false;
}
+static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) {
+ // Match masked merge pattern (X & ~M) op (Y & M)
+ // Including degenerate case (X & ~M) op M
+ auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue Other) {
+ if (isBitwiseNot(NotM, true)) {
+ SDValue NotOperand = NotM->getOperand(0);
+ if (Other == NotOperand)
+ return true;
+ if (Other->getOpcode() == ISD::AND)
+ return NotOperand == Other->getOperand(0) ||
+ NotOperand == Other->getOperand(1);
+ }
+ return false;
+ };
+ if (A->getOpcode() == ISD::AND)
+ return MatchNoCommonBitsPattern(A->getOperand(0), B) ||
+ MatchNoCommonBitsPattern(A->getOperand(1), B);
+ return false;
+}
+
// FIXME: unify with llvm::haveNoCommonBitsSet.
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
assert(A.getValueType() == B.getValueType() &&
"Values must have the same type");
- // Match masked merge pattern (X & ~M) op (Y & M)
- if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) {
- auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) {
- if (isBitwiseNot(NotM, true)) {
- SDValue NotOperand = NotM->getOperand(0);
- return NotOperand == And->getOperand(0) ||
- NotOperand == And->getOperand(1);
- }
- return false;
- };
- if (MatchNoCommonBitsPattern(A->getOperand(0), B) ||
- MatchNoCommonBitsPattern(A->getOperand(1), B) ||
- MatchNoCommonBitsPattern(B->getOperand(0), A) ||
- MatchNoCommonBitsPattern(B->getOperand(1), A))
- return true;
- }
+ if (haveNoCommonBitsSetCommutative(A, B) ||
+ haveNoCommonBitsSetCommutative(B, A))
+ return true;
return KnownBits::haveNoCommonBitsSet(computeKnownBits(A),
computeKnownBits(B));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9732a17..b209aecf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9044,7 +9044,9 @@ void TargetLowering::expandUADDSUBO(
if (IsAdd && isOneConstant(RHS)) {
// Special case: uaddo X, 1 overflowed if X+1 is 0. This potentially reduces
// the live range of X. We assume comparing with 0 is cheap.
- // TODO: This generalizes to (X + C) < C.
+ // The general case (X + C) < C is not necessarily beneficial. Although we
+ // reduce the live range of X, we may introduce the materialization of
+ // constant C.
SetCC =
DAG.getSetCC(dl, SetCCType, Result,
DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ);
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 3cade80..1aa2d44 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -2029,3 +2029,14 @@ void AttributeFuncs::mergeAttributesForOutlining(Function &Base,
// that aspect in the merged function.
mergeFnAttrs(Base, ToMerge);
}
+
+void AttributeFuncs::updateMinLegalVectorWidthAttr(Function &Fn,
+ uint64_t Width) {
+ Attribute Attr = Fn.getFnAttribute("min-legal-vector-width");
+ if (Attr.isValid()) {
+ uint64_t OldWidth;
+ Attr.getValueAsString().getAsInteger(0, OldWidth);
+ if (Width > OldWidth)
+ Fn.addFnAttr("min-legal-vector-width", llvm::utostr(Width));
+ }
+}
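
A hedged sketch of a call site for the new helper (the function below is hypothetical): a transform that introduces wider vector values bumps "min-legal-vector-width" only when the attribute is already present and smaller than the requested width.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // After creating 512-bit vector values in F, make sure the recorded
    // minimum legal vector width is at least 512; this is a no-op when the
    // attribute is absent or already large enough.
    static void noteWidenedVectors(Function &F) {
      AttributeFuncs::updateMinLegalVectorWidthAttr(F, 512);
    }
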
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 5dcf1ba..c182513 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -2068,6 +2068,17 @@ Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) {
return getTrunc(C, Ty);
}
+Constant *ConstantExpr::getSExtOrTrunc(Constant *C, Type *Ty) {
+ assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
+ "Can only sign extend/truncate integers!");
+ Type *CTy = C->getType();
+ if (CTy->getScalarSizeInBits() < Ty->getScalarSizeInBits())
+ return getSExt(C, Ty);
+ if (CTy->getScalarSizeInBits() > Ty->getScalarSizeInBits())
+ return getTrunc(C, Ty);
+ return C;
+}
+
Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) &&
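
A brief usage sketch of the new ConstantExpr::getSExtOrTrunc (the caller is hypothetical): like the neighbouring getTruncOrBitCast, it inspects the relative scalar widths and picks sext, trunc, or returns the constant unchanged.

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // An i32 constant is sign-extended to i64, an i128 constant is truncated,
    // and an i64 constant is returned as-is.
    static Constant *canonicalizeIndexToI64(Constant *Idx, LLVMContext &Ctx) {
      return ConstantExpr::getSExtOrTrunc(Idx, Type::getInt64Ty(Ctx));
    }
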
diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp
index 67ed44a..1703f76 100644
--- a/llvm/lib/Object/Binary.cpp
+++ b/llvm/lib/Object/Binary.cpp
@@ -84,6 +84,7 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
case file_magic::unknown:
case file_magic::cuda_fatbinary:
case file_magic::coff_cl_gl_object:
+ case file_magic::dxcontainer_object:
// Unrecognized object file format.
return errorCodeToError(object_error::invalid_file_type);
case file_magic::minidump:
diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt
index 0825210..ba612e3 100644
--- a/llvm/lib/Object/CMakeLists.txt
+++ b/llvm/lib/Object/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMObject
COFFModuleDefinition.cpp
COFFObjectFile.cpp
Decompressor.cpp
+ DXContainer.cpp
ELF.cpp
ELFObjectFile.cpp
Error.cpp
diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp
new file mode 100644
index 0000000..e1aea562
--- /dev/null
+++ b/llvm/lib/Object/DXContainer.cpp
@@ -0,0 +1,44 @@
+//===- DXContainer.cpp - DXContainer object file implementation -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/DXContainer.h"
+#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Object/Error.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+static Error parseFailed(const Twine &Msg) {
+ return make_error<GenericBinaryError>(Msg.str(), object_error::parse_failed);
+}
+
+template <typename T>
+static Error readStruct(StringRef Buffer, const char *P, T &Struct) {
+ // Don't read before the beginning or past the end of the file
+ if (P < Buffer.begin() || P + sizeof(T) > Buffer.end())
+ return parseFailed("Reading structure out of file bounds");
+
+ memcpy(&Struct, P, sizeof(T));
+ // DXContainer data is always little-endian, so byte-swap on big-endian hosts
+ if (sys::IsBigEndianHost)
+ Struct.byteSwap();
+ return Error::success();
+}
+
+DXContainer::DXContainer(MemoryBufferRef O) : Data(O) {}
+
+Error DXContainer::parseHeader() {
+ return readStruct(Data.getBuffer(), Data.getBuffer().data(), Header);
+}
+
+Expected<DXContainer> DXContainer::create(MemoryBufferRef Object) {
+ DXContainer Container(Object);
+ if (Error Err = Container.parseHeader())
+ return std::move(Err);
+ return Container;
+}
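
A hedged usage sketch for the new reader (the wrapper below is hypothetical): create() parses the header eagerly, so a truncated buffer surfaces the bounds-check failure from readStruct as an Error.

    #include "llvm/Object/DXContainer.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/MemoryBuffer.h"
    using namespace llvm;
    using namespace llvm::object;

    // Returns the parse error (e.g. "Reading structure out of file bounds")
    // for a too-small buffer, or success once the header has been read.
    static Error checkDXContainer(MemoryBufferRef Buf) {
      Expected<DXContainer> C = DXContainer::create(Buf);
      if (!C)
        return C.takeError();
      return Error::success();
    }
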
diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp
index fed6726..609dfae 100644
--- a/llvm/lib/Object/ObjectFile.cpp
+++ b/llvm/lib/Object/ObjectFile.cpp
@@ -147,6 +147,7 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type,
case file_magic::minidump:
case file_magic::goff_object:
case file_magic::cuda_fatbinary:
+ case file_magic::dxcontainer_object:
return errorCodeToError(object_error::invalid_file_type);
case file_magic::tapi_file:
return errorCodeToError(object_error::invalid_file_type);
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 4c92502..3e5fff9 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -918,21 +918,34 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
return I - 1;
}
-// Windows treats whitespace, double quotes, and backslashes specially.
+// Windows treats whitespace, double quotes, and backslashes specially, except
+// when parsing the first token of a full command line, in which case
+// backslashes are not special.
static bool isWindowsSpecialChar(char C) {
return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
}
+static bool isWindowsSpecialCharInCommandName(char C) {
+ return isWhitespaceOrNull(C) || C == '\"';
+}
// Windows tokenization implementation. The implementation is designed to be
// inlined and specialized for the two user entry points.
-static inline void
-tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
- function_ref<void(StringRef)> AddToken,
- bool AlwaysCopy, function_ref<void()> MarkEOL) {
+static inline void tokenizeWindowsCommandLineImpl(
+ StringRef Src, StringSaver &Saver, function_ref<void(StringRef)> AddToken,
+ bool AlwaysCopy, function_ref<void()> MarkEOL, bool InitialCommandName) {
SmallString<128> Token;
+ // Sometimes, this function will be handling a full command line including an
+ // executable pathname at the start. In that situation, the initial pathname
+ // needs different handling from the following arguments, because when
+ // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as
+ // escaping the quote character, whereas when libc scans the rest of the
+ // command line, it does.
+ bool CommandName = InitialCommandName;
+
// Try to do as much work inside the state machine as possible.
enum { INIT, UNQUOTED, QUOTED } State = INIT;
+
for (size_t I = 0, E = Src.size(); I < E; ++I) {
switch (State) {
case INIT: {
@@ -947,19 +960,29 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
if (I >= E)
break;
size_t Start = I;
- while (I < E && !isWindowsSpecialChar(Src[I]))
- ++I;
+ if (CommandName) {
+ while (I < E && !isWindowsSpecialCharInCommandName(Src[I]))
+ ++I;
+ } else {
+ while (I < E && !isWindowsSpecialChar(Src[I]))
+ ++I;
+ }
StringRef NormalChars = Src.slice(Start, I);
if (I >= E || isWhitespaceOrNull(Src[I])) {
// No special characters: slice out the substring and start the next
// token. Copy the string if the caller asks us to.
AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
- if (I < E && Src[I] == '\n')
+ if (I < E && Src[I] == '\n') {
MarkEOL();
+ CommandName = InitialCommandName;
+ } else {
+ CommandName = false;
+ }
} else if (Src[I] == '\"') {
Token += NormalChars;
State = QUOTED;
} else if (Src[I] == '\\') {
+ assert(!CommandName && "or else we'd have treated it as a normal char");
Token += NormalChars;
I = parseBackslash(Src, I, Token);
State = UNQUOTED;
@@ -976,12 +999,16 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
// token.
AddToken(Saver.save(Token.str()));
Token.clear();
- if (Src[I] == '\n')
+ if (Src[I] == '\n') {
+ CommandName = InitialCommandName;
MarkEOL();
+ } else {
+ CommandName = false;
+ }
State = INIT;
} else if (Src[I] == '\"') {
State = QUOTED;
- } else if (Src[I] == '\\') {
+ } else if (Src[I] == '\\' && !CommandName) {
I = parseBackslash(Src, I, Token);
} else {
Token.push_back(Src[I]);
@@ -999,7 +1026,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
// Otherwise, end the quoted portion and return to the unquoted state.
State = UNQUOTED;
}
- } else if (Src[I] == '\\') {
+ } else if (Src[I] == '\\' && !CommandName) {
I = parseBackslash(Src, I, Token);
} else {
Token.push_back(Src[I]);
@@ -1008,7 +1035,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
}
}
- if (State == UNQUOTED)
+ if (State != INIT)
AddToken(Saver.save(Token.str()));
}
@@ -1021,7 +1048,7 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
NewArgv.push_back(nullptr);
};
tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
- /*AlwaysCopy=*/true, OnEOL);
+ /*AlwaysCopy=*/true, OnEOL, false);
}
void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
@@ -1029,7 +1056,19 @@ void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
auto OnEOL = []() {};
tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
- OnEOL);
+ OnEOL, false);
+}
+
+void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver,
+ SmallVectorImpl<const char *> &NewArgv,
+ bool MarkEOLs) {
+ auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+ auto OnEOL = [&]() {
+ if (MarkEOLs)
+ NewArgv.push_back(nullptr);
+ };
+ tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+ /*AlwaysCopy=*/true, OnEOL, true);
}
void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,
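
A small usage sketch of the new entry point (the command line below is made up): because the leading token is treated as the program path, its backslashes are kept literally instead of being interpreted as escapes.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/Allocator.h"
    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/StringSaver.h"
    using namespace llvm;

    static void splitFullCommandLine() {
      BumpPtrAllocator Alloc;
      StringSaver Saver(Alloc);
      SmallVector<const char *, 8> Argv;
      // Argv becomes: "C:\tools\clang.exe", "-c", "a b.c"
      cl::TokenizeWindowsCommandLineFull("C:\\tools\\clang.exe -c \"a b.c\"",
                                         Saver, Argv, /*MarkEOLs=*/false);
    }
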
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index 98272bb..976599f 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
}
}
+ if (Implementer == "0xc0") { // Ampere Computing
+ return StringSwitch<const char *>(Part)
+ .Case("0xac3", "ampere1")
+ .Default("generic");
+ }
+
return "generic";
}
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index dfaab16..e415674 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -247,7 +247,7 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
SmallVector<const char *, 20> TmpArgs;
StringSaver Saver(Alloc);
- cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false);
+ cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false);
for (const char *Arg : TmpArgs) {
EC = WildcardExpand(Arg, Args, Saver);
@@ -255,6 +255,9 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
return EC;
}
+ if (Args.size() == 0)
+ return std::make_error_code(std::errc::invalid_argument);
+
SmallVector<char, MAX_PATH> Arg0(Args[0], Args[0] + strlen(Args[0]));
SmallVector<char, MAX_PATH> Filename;
sys::path::remove_filename(Arg0);
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index bd6deb5..2682b9b 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -567,6 +567,7 @@ include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
include "AArch64SchedPredicates.td"
include "AArch64SchedPredExynos.td"
+include "AArch64SchedPredAmpere.td"
include "AArch64Combine.td"
def AArch64InstrInfo : InstrInfo;
@@ -636,6 +637,7 @@ include "AArch64SchedThunderX2T99.td"
include "AArch64SchedA64FX.td"
include "AArch64SchedThunderX3T110.td"
include "AArch64SchedTSV110.td"
+include "AArch64SchedAmpere1.td"
def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors">;
@@ -956,6 +958,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
FeatureFuseAES,
FeaturePostRAScheduler]>;
+def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
+ "Ampere Computing Ampere-1 processors", [
+ FeaturePostRAScheduler,
+ FeatureFuseAES,
+ FeatureLSLFast,
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeatureCmpBccFusion,
+ FeatureFuseAddress,
+ FeatureFuseLiterals]>;
def ProcessorFeatures {
list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -1067,6 +1079,8 @@ def ProcessorFeatures {
list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSPE,
FeatureFullFP16, FeatureFP16FML, FeatureDotProd];
+ list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
+ FeatureMTE, FeatureSSBS];
// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64. The extensions do not
@@ -1205,6 +1219,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX,
def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel,
[TuneCarmel]>;
+// Ampere Computing
+def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1,
+ [TuneAmpere1]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index c367d2d..71911b6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5092,12 +5092,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
SDValue &OffImm) {
const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
const DataLayout &DL = CurDAG->getDataLayout();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
- return true;
+ // We can only encode VL scaled offsets, so only fold in frame indexes
+ // referencing SVE objects.
+ if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ return false;
}
if (MemVT == EVT())
@@ -5124,7 +5131,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ // We can only encode VL scaled offsets, so only fold in frame indexes
+ // referencing SVE objects.
+ if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
new file mode 100644
index 0000000..32f7299
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
@@ -0,0 +1,1136 @@
+//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Ampere Computing Ampere-1 to
+// support instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// The Ampere-1 core is an out-of-order micro-architecture. The front
+// end has branch prediction, with a 10-cycle recovery time from a
+// mispredicted branch. Instructions coming out of the front end are
+// decoded into internal micro-ops (uops).
+
+def Ampere1Model : SchedMachineModel {
+ let IssueWidth = 4; // 4-way decode and dispatch
+ let MicroOpBufferSize = 174; // micro-op re-order buffer size
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 10; // Branch mispredict penalty
+ let LoopMicroOpBufferSize = 32; // Instruction queue size
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ SMEUnsupported.F);
+}
+
+let SchedModel = Ampere1Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Ampere-1.
+// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
+// and 2 memory) issue into. The integer and FP schedulers can each issue
+// one uop per cycle, while the memory schedulers can each issue one load
+// and one store address calculation per cycle.
+
+def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w
+def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts
+def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle
+def Ampere1UnitL : ProcResource<2>; // load
+def Ampere1UnitS : ProcResource<2>; // store address calculation
+def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write
+def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto
+def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves
+
+def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
+def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Ampere-1.
+
+def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+ Ampere1UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+ Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS,
+ Ampere1UnitAB]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 9;
+}
+
+def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 7;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA,
+ Ampere1UnitA]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 9;
+ let NumMicroOps = 14;
+}
+
+def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 9;
+ let NumMicroOps = 16;
+}
+
+def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 11;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 18;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 19;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 25;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 32;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 34;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 34;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 39;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 62;
+ let NumMicroOps = 1;
+}
+
+// For basic arithmetic we have more flexibility for short shifts (an LSL by
+// at most 4), which issue as a single uop, and for extended registers, whose
+// two uops both have full flexibility across Unit A or B.
+def Ampere1Write_Arith : SchedWriteVariant<[
+ SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
+ SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>,
+ SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>;
+
+def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
+ SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
+ SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>,
+ SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>;
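+// For example (an illustrative sketch, not taken from Ampere documentation):
+// via Ampere1Write_Arith, "add x0, x1, x2, lsl #3" resolves to the single-uop
+// Ampere1Write_1cyc_1AB, "add x0, x1, x2, lsl #5" falls back to the two-uop
+// Ampere1Write_2cyc_1B_1AB, and the extended-register "add x0, x1, w2, uxtw"
+// maps to Ampere1Write_2cyc_2AB.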
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1.
+// This provides a coarse model, which is then specialised below.
+
+def : WriteRes<WriteImm, [Ampere1UnitAB]>; // MOVN, MOVZ
+def : WriteRes<WriteI, [Ampere1UnitAB]>; // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+} // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+} // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [Ampere1UnitB]>; // EXTR shifts a reg pair
+def : WriteRes<WriteIS, [Ampere1UnitB]>; // Shift/Scale
+def : WriteRes<WriteID32, [Ampere1UnitBS]> {
+ let Latency = 18;
+} // 32-bit Divide
+def : WriteRes<WriteID64, [Ampere1UnitBS]> {
+ let Latency = 34;
+} // 64-bit Divide
+def : WriteRes<WriteIM32, [Ampere1UnitBS]> {
+ let Latency = 3;
+} // 32-bit Multiply
+def : WriteRes<WriteIM64, [Ampere1UnitBS]> {
+ let Latency = 3;
+} // 64-bit Multiply
+def : WriteRes<WriteBr, [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD, [Ampere1UnitL]> {
+ let Latency = 4;
+} // Load from base addr plus immediate offset
+def : WriteRes<WriteST, [Ampere1UnitS]> {
+ let Latency = 1;
+} // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+} // Store a register pair.
+def : WriteRes<WriteAdr, [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+} // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+} // Store to a register index (maybe scaled).
+def : WriteRes<WriteF, [Ampere1UnitXY]> {
+ let Latency = 2;
+} // General floating-point ops.
+def : WriteRes<WriteFCmp, [Ampere1UnitX]> {
+ let Latency = 5;
+} // Floating-point compare.
+def : WriteRes<WriteFCvt, [Ampere1UnitXY]> {
+ let Latency = 6;
+} // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
+} // Float-int register copy.
+def : WriteRes<WriteFImm, [Ampere1UnitXY]> {
+ let Latency = 2;
+} // Floating-point immediate.
+def : WriteRes<WriteFMul, [Ampere1UnitXY]> {
+ let Latency = 5;
+} // Floating-point multiply.
+def : WriteRes<WriteFDiv, [Ampere1UnitXY]> {
+ let Latency = 34;
+} // Floating-point division.
+def : WriteRes<WriteVd, [Ampere1UnitXY]> {
+ let Latency = 3;
+} // 64bit Vector D ops.
+def : WriteRes<WriteVq, [Ampere1UnitXY]> {
+ let Latency = 3;
+} // 128bit Vector Q ops.
+def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 5;
+} // Vector loads.
+def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 2;
+} // Vector stores.
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
+
+def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;
+
+// Branch instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
+
+// Cryptography instructions
+// -- AES encryption/decryption
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
+// -- Polynomial multiplication
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
+// -- SHA-256 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
+// -- SHA-256 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
+// -- SHA-3 instructions
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
+// -- SHA-512 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
+// -- SHA-512 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
+// -- SHA1 choose/majority/parity
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
+// -- SHA1 hash/schedule update
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;
+
+// FP and vector load instructions
+// -- Load 1-element structure to one/all lanes
+// ---- all lanes
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+ (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// ---- one lane
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+ (instregex "^LD1i(8|16|32|64)")>;
+// -- Load 1-element structure to one/all lanes, 1D size
+def : InstRW<[Ampere1Write_5cyc_1L],
+ (instregex "^LD1Rv1d")>;
+// -- Load 1-element structures to 1 register
+def : InstRW<[Ampere1Write_5cyc_1L],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 2 registers
+def : InstRW<[Ampere1Write_5cyc_2L],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 3 registers
+def : InstRW<[Ampere1Write_6cyc_3L],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to all lanes of 2 registers, 1D size
+def : InstRW<[Ampere1Write_5cyc_2L],
+ (instregex "^LD2Rv1d")>;
+// -- Load 2-element structure to all lanes of 2 registers, other sizes
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to one lane of 2 registers
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2i(8|16|32|64)")>;
+// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2Twov(16b|8h|4s|2d)")>;
+// -- Load 2-element structures to 2 registers, 8B/4H/2S size
+def : InstRW<[Ampere1Write_9cyc_2L_3XY],
+ (instregex "^LD2Twov(8b|4h|2s)")>;
+// -- Load 3-element structure to all lanes of 3 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_3L],
+ (instregex "^LD3Rv1d")>;
+// -- Load 3-element structure to all lanes of 3 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 3-element structure to one lane of 3 registers
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3i(8|16|32|64)")>;
+// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_3L_3XY],
+ (instregex "^LD3Threev(16b|8h|4s)")>;
+// -- Load 3-element structures to 3 registers, 2D size
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3Threev2d")>;
+// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_10cyc_3L_3XY],
+ (instregex "^LD3Threev(8b|4h|2s)")>;
+// -- Load 4-element structure to all lanes of 4 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD4Rv1d")>;
+// -- Load 4-element structure to all lanes of 4 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_4L_4XY],
+ (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 4-element structure to one lane of 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD4i(8|16|32|64)")>;
+// -- Load 4-element structures to 4 registers, 2D size
+def : InstRW<[Ampere1Write_9cyc_4L_4XY],
+ (instregex "^LD4Fourv2d")>;
+// -- Load 4-element structures to 4 registers, 2S size
+def : InstRW<[Ampere1Write_12cyc_4L_8XY],
+ (instregex "^LD4Fourv2s")>;
+// -- Load 4-element structures to 4 registers, other sizes
+def : InstRW<[Ampere1Write_11cyc_4L_8XY],
+ (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
+// -- Load pair, Q-form
+def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>;
+// -- Load pair, S/D-form
+def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>;
+// -- Load register
+def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>;
+// -- Load register, sign-extended register
+def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>;
+
+// FP and vector store instructions
+// -- Store 1-element structure from one lane of 1 register
+def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
+ (instregex "^ST1i(8|16|32|64)")>;
+// -- Store 1-element structures from 1 register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 2 registers
+def : InstRW<[Ampere1Write_3cyc_2S_2Z],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 3 registers
+def : InstRW<[Ampere1Write_4cyc_3S_3Z],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 4 registers
+def : InstRW<[Ampere1Write_5cyc_4S_4Z],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 2-element structure from one lane of 2 registers
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+ (instregex "^ST2i(8|16|32|64)")>;
+// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+ (instregex "^ST2Twov(16b|8h|4s|2d)")>;
+// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
+ (instregex "^ST2Twov(8b|4h|2s)")>;
+// -- Store 3-element structure from one lane of 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+ (instregex "^ST3i(8|16|32|64)")>;
+// -- Store 3-element structures from 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+ (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 4-element structure from one lane of 4 registers
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+ (instregex "^ST4i(8|16|32|64)")>;
+// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
+ (instregex "^ST4Fourv(16b|8h|4s)")>;
+// -- Store 4-element structures from 4 registers, 2D sizes
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+ (instregex "^ST4Fourv2d")>;
+// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
+ (instregex "^ST4Fourv(8b|4h|2s)")>;
+// -- Store pair, Q-form
+def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>;
+// -- Store pair, S/D-form
+def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>;
+// -- Store register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
+// -- Store register, sign-extended register offset
+def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;
+
+// FP data processing, bfloat16 format
+def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
+def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
+def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;
+
+// FP data processing, scalar/vector, half precision
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
+def : InstRW<[Ampere1Write_4cyc_1X],
+ (instregex "^FCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
+ (instregex "^FCCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
+ (instregex "^FCSELH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>;
+def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
+def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
+def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>;
+
+// FP data processing, scalar/vector, single/double precision
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1X],
+ (instregex "^FCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X],
+ (instregex "^FCCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY],
+ (instregex "^FCSEL(S|D)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>;
+def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>;
+def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
+def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>;
+
+// FP miscellaneous instructions
+def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
+def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>;
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
+def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
+
+// Integer arithmetic and logical instructions
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "ADC(W|X)r", "SBC(W|X)r")>;
+def : InstRW<[Ampere1Write_Arith],
+ (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>;
+def : InstRW<[Ampere1Write_ArithFlagsetting],
+ (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(ADC|SBC)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(CCMN|CCMP)(X|W)")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
+def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>;
+def : InstRW<[Ampere1Write_3cyc_1BS],
+ (instregex "(S|U)MULHr")>;
+def : InstRW<[Ampere1Write_4cyc_1BS],
+ (instregex "(S|U)?M(ADD|SUB)L?r")>;
+
+// Integer load instructions
+def : InstRW<[Ampere1Write_4cyc_2L],
+ (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDR(B|D|H|Q|S)ui")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDR(D|Q|W|X)l")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDTR(B|H|W|X)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDUR(BB|HH|X|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDURS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_5cyc_1AB_1L],
+ (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1L],
+             (instrs PRFMl, PRFMui, PRFUMi)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_1L],
+ (instrs PRFMroW, PRFMroX)>;
+
+// Integer miscellaneous instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>;
+def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>;
+def : InstRW<[Ampere1Write_1cyc_1AB],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+ (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+ (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
+
+// Integer store instructions
+def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>;
+def : InstRW<[Ampere1Write_2cyc_1B_1S],
+ (instrs STPWi, STPXi)>;
+def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
+ (instregex "STP(W|X)(pre|post)")>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+ (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+             (instregex "STUR(BB|HH|X|W)i",
+                        "STR(X|W)ui")>;
+def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>;
+
+// Pointer authentication
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
+def : InstRW<[Ampere1Write_8cyc_1BS_1A],
+ (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
+def : InstRW<[Ampere1Write_8cyc_1BS_2A],
+ (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
+def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
+def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;
+
+// Vector integer instructions
+// -- absolute difference
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
+ "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
+// -- arithmetic
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
+ "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
+ "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
+// -- arithmetic, horizontal, 16B
+def : InstRW<[Ampere1Write_12cyc_4XY],
+ (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
+def : InstRW<[Ampere1Write_12cyc_4XY],
+ (instregex "^[SU](MIN|MAX)Vv16i8v")>;
+// -- arithmetic, horizontal, 4H/4S
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
+// -- arithmetic, horizontal, 8B/8H
+def : InstRW<[Ampere1Write_9cyc_3XY],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
+def : InstRW<[Ampere1Write_9cyc_3XY],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
+// -- arithmetic, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
+// -- arithmetic, pairwise
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
+// -- arithmetic, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
+// -- bit count
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^(CLS|CLZ|CNT)v")>;
+// -- compare
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
+ "^CMHIv", "^CMHSv")>;
+// -- compare non-zero
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
+// -- dot product
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
+// -- fp reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>;
+// -- integer reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
+// -- logical
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+// -- logical, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY],
+ (instregex "RSHRNv",
+ "SHRNv", "SQSHRNv", "SQSHRUNv",
+ "UQXTNv")>;
+// -- matrix multiply
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instrs SMMLA, UMMLA, USMMLA)>;
+// -- max/min
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
+// -- move immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
+// -- multiply
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
+// -- multiply accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
+// -- negation, saturating
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
+// -- reverse bits/bytes
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
+// -- shift
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// -- shift and accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
+// -- shift, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
+ "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
+ "^UQSHL")>;
+
+// Vector miscellaneous instructions
+// -- duplicate element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>;
+// -- duplicate from GPR
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>;
+// -- extract narrow
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>;
+// -- insert/extract element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
+// -- move FP immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>;
+// -- move element to GPR
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>;
+// -- move from GPR to any element
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
+// -- table lookup
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
+def : InstRW<[Ampere1Write_4cyc_2XY],
+ (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
+def : InstRW<[Ampere1Write_6cyc_3XY],
+ (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
+def : InstRW<[Ampere1Write_8cyc_4XY],
+ (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
+// -- transpose/unzip
+def : InstRW<[Ampere1Write_2cyc_1XY],
+             (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
+// -- zip
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
+
+} // SchedModel = Ampere1Model
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
new file mode 100644
index 0000000..8552c07
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
@@ -0,0 +1,25 @@
+//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 Ampere Computing processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Auxiliary predicates.
+
+// Check for a LSL shift <= 4
+def AmpereCheapLSL : MCSchedPredicate<
+ CheckAny<[CheckShiftBy0,
+ CheckAll<
+ [CheckShiftLSL,
+ CheckAny<
+ [CheckShiftBy1,
+ CheckShiftBy2,
+ CheckShiftBy3,
+ CheckShiftBy4]>]>]>>;
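+
+// For example (illustrative): this predicate accepts "add x0, x1, x2, lsl #4"
+// (and any LSL by 0 to 4), but rejects "add x0, x1, x2, lsl #5" and non-LSL
+// forms such as "add x0, x1, x2, asr #2".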
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
index 5402b8b..4473f3a 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -53,7 +53,7 @@ let FunctionMapper = "AArch64_AM::getShiftType" in {
}
// Check for shifting in arithmetic and logic instructions.
-foreach I = {0-3, 8} in {
+foreach I = {0-4, 8} in {
let FunctionMapper = "AArch64_AM::getShiftValue" in
def CheckShiftBy#I : CheckImmOperand<3, I>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index b3eb65d..f9b7ca8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -238,6 +238,12 @@ void AArch64Subtarget::initializeProperties() {
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
+ case Ampere1:
+ CacheLineSize = 64;
+ PrefFunctionLogAlignment = 6;
+ PrefLoopLogAlignment = 6;
+ MaxInterleaveFactor = 4;
+ break;
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index e919263..d7878c4 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -40,6 +40,7 @@ public:
enum ARMProcFamilyEnum : uint8_t {
Others,
A64FX,
+ Ampere1,
AppleA7,
AppleA10,
AppleA11,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a184159..135b94d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -371,6 +371,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return Entry->Cost;
break;
}
+ case Intrinsic::fptosi_sat:
+ case Intrinsic::fptoui_sat: {
+ if (ICA.getArgTypes().empty())
+ break;
+ bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
+ auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
+ EVT MTy = TLI->getValueType(DL, RetTy);
+ // Check for the legal types, which are where the size of the input and the
+ // output are the same, or we are using cvt f64->i32 or f32->i64.
+ if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
+ LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
+ LT.second == MVT::v2f64) &&
+ (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
+ (LT.second == MVT::f64 && MTy == MVT::i32) ||
+ (LT.second == MVT::f32 && MTy == MVT::i64)))
+ return LT.first;
+ // Similarly for fp16 sizes
+ if (ST->hasFullFP16() &&
+ ((LT.second == MVT::f16 && MTy == MVT::i32) ||
+ ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
+ (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
+ return LT.first;
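+
+    // For example (illustrative): a v4f32 -> v4i32, f64 -> i32, or (with
+    // full fp16) f16 -> i32 saturating conversion maps directly onto the
+    // native FCVTZS/FCVTZU instructions, so only the type-legalization
+    // factor LT.first is charged.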
+
+ // Otherwise we use a legal convert followed by a min+max
+ if ((LT.second.getScalarType() == MVT::f32 ||
+ LT.second.getScalarType() == MVT::f64 ||
+ (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
+ LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
+ Type *LegalTy =
+ Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
+ if (LT.second.isVector())
+ LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
+ InstructionCost Cost = 1;
+ IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
+ LegalTy, {LegalTy, LegalTy});
+ Cost += getIntrinsicInstrCost(Attrs1, CostKind);
+ IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
+ LegalTy, {LegalTy, LegalTy});
+ Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+ return LT.first * Cost;
+ }
+ break;
+ }
default:
break;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 805b6c7..bfe2e9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -126,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
LLT::scalar(64));
const LLT S32 = LLT::scalar(32);
- B.setMBB(*MI.getParent());
B.setInstrAndDebugLoc(MI);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 76c8edc..a8310c2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -737,7 +737,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Custom);
}
}
@@ -4772,6 +4772,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return lowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
@@ -5768,14 +5770,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
+ SDLoc SL(Op);
-
- assert(VecSize <= 64);
-
+ // Specially handle the case of v4i16 with static indexing.
unsigned NumElts = VecVT.getVectorNumElements();
- SDLoc SL(Op);
auto KIdx = dyn_cast<ConstantSDNode>(Idx);
-
if (NumElts == 4 && EltSize == 16 && KIdx) {
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
@@ -5803,35 +5802,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
}
+ // Static indexing does not lower to stack access, and hence there is no need
+ // for special custom lowering to avoid stack access.
if (isa<ConstantSDNode>(Idx))
return SDValue();
- MVT IntVT = MVT::getIntegerVT(VecSize);
-
- // Avoid stack access for dynamic indexing.
+ // Avoid stack access for dynamic indexing by custom lowering to
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- // Create a congruent vector with the target value in each element so that
- // the required element can be masked and ORed into the target vector.
- SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
- DAG.getSplatBuildVector(VecVT, SL, InsVal));
+ assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
+ // Convert vector index to bit-index and get the required bit mask.
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
-
- // Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
-
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
+ // 1. Create a congruent vector with the target value in each element.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
+
+  // 2. Mask off all other indices except the required index within (1).
SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+
+ // 3. Mask off the required index within the target vector.
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
DAG.getNOT(SL, BFM, IntVT), BCVec);
+ // 4. Get (2) and (3) ORed into the target vector.
SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+
return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
@@ -5954,6 +5959,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
+SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue SVal = Op.getOperand(0);
+ EVT ResultVT = Op.getValueType();
+ EVT SValVT = SVal.getValueType();
+ SDValue UndefVal = DAG.getUNDEF(SValVT);
+ SDLoc SL(Op);
+
+ SmallVector<SDValue, 8> VElts;
+ VElts.push_back(SVal);
+ for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
+ VElts.push_back(UndefVal);
+
+ return DAG.getBuildVector(ResultVT, SL, VElts);
+}
+
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -10661,39 +10682,64 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
-SDValue SITargetLowering::performAddCombine(SDNode *N,
+// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z).
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::ADD);
+
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
- && Subtarget->hasMad64_32() &&
- !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
- VT.getScalarSizeInBits() <= 64) {
- if (LHS.getOpcode() != ISD::MUL)
- std::swap(LHS, RHS);
+ if (VT.isVector())
+ return SDValue();
- SDValue MulLHS = LHS.getOperand(0);
- SDValue MulRHS = LHS.getOperand(1);
- SDValue AddRHS = RHS;
+ unsigned NumBits = VT.getScalarSizeInBits();
+ if (NumBits <= 32 || NumBits > 64)
+ return SDValue();
- // TODO: Maybe restrict if SGPR inputs.
- if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
- numBitsUnsigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
- }
+ if (LHS.getOpcode() != ISD::MUL) {
+ assert(RHS.getOpcode() == ISD::MUL);
+ std::swap(LHS, RHS);
+ }
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // TODO: Maybe restrict if SGPR inputs.
+ if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
+ numBitsUnsigned(MulRHS, DAG) <= 32) {
+ MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ }
+
+ if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
+ MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
- if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
}
return SDValue();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 72a00c1..18bb9fb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -151,6 +151,7 @@ private:
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
@@ -197,6 +198,7 @@ private:
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
+ SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index c2cc302..b2765b2 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2562,8 +2562,9 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Make sure the LiveIns are still sorted and unique.
MBB->sortUniqueLiveIns();
// Replace the edges to PrologueMBB by edges to the sequences
- // we are about to add.
- MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
+ // we are about to add, but only update for immediate predecessors.
+ if (MBB->isSuccessor(&PrologueMBB))
+ MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
}
// The required stack size that is aligned to ARM constant criterion.
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index bbd5f5f..44a323d 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -376,7 +376,8 @@ def ProcessorFeatures {
FeaturePartwordAtomic,
FeatureQuadwordAtomic,
FeaturePredictableSelectIsExpensive,
- FeatureISA2_07
+ FeatureISA2_07,
+ FeatureCRBits
];
list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 9d8290f..5b28f0d 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -32,7 +32,7 @@ class PassRegistry;
bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
-bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp, const AsmPrinter &AP);
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 7d1ec2a..5b2a247 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -63,7 +63,7 @@ public:
// Wrapper needed for tblgenned pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
- return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this);
+ return lowerRISCVMachineOperandToMCOperand(MO, MCOp, *this);
}
void emitStartOfAsmFile(Module &M) override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index de9c151..7fae031 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1121,16 +1121,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue V0 = CurDAG->getRegister(RISCV::V0, VT);
// Otherwise use
- // vmslt{u}.vx vd, va, x, v0.t; if mask policy is agnostic.
+ // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
+      // The result is mask-undisturbed.
+      // We use the same instructions to emulate mask-agnostic behavior, because
+      // an agnostic result can be either undisturbed or all ones.
SDValue Cmp = SDValue(
CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT,
{MaskedOff, Src1, Src2, V0, VL, SEW, Glue}),
0);
- if (MaskedOff.isUndef()) {
- ReplaceNode(Node, Cmp.getNode());
- return;
- }
- // Need vmxor.mm vd, vd, v0 to assign inactive value.
+ // vmxor.mm vd, vd, v0 is used to update active value.
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT,
{Cmp, Mask, VL, MaskSEW}));
return;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4cb3188..ff63b22 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6918,7 +6918,10 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Overflow;
if (IsAdd && isOneConstant(RHS)) {
// Special case uaddo X, 1 overflowed if the addition result is 0.
- // FIXME: We can do this for any constant RHS by using (X + C) < C.
+      // The general case (X + C) < C is not necessarily beneficial. Although we
+      // reduce the live range of X, we may introduce the materialization of
+      // constant C, especially when the setcc result is used by a branch, and
+      // RISC-V has no compare-with-constant-and-branch instruction.
Overflow = DAG.getSetCC(DL, N->getValueType(1), Res,
DAG.getConstant(0, DL, MVT::i64), ISD::SETEQ);
} else {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index f7a1998..3831dc5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -114,11 +114,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_D, "fnmsub.d", DINX>;
defm : FPFMADynFrmAlias_m<FNMADD_D, "fnmadd.d", DINX>;
let SchedRW = [WriteFALU64, ReadFALU64, ReadFALU64] in {
-defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX>;
+defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX, /*Commutable*/1>;
defm FSUB_D : FPALU_rr_frm_m<0b0000101, "fsub.d", DINX>;
}
let SchedRW = [WriteFMul64, ReadFMul64, ReadFMul64] in
-defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX>;
+defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX, /*Commutable*/1>;
let SchedRW = [WriteFDiv64, ReadFDiv64, ReadFDiv64] in
defm FDIV_D : FPALU_rr_frm_m<0b0001101, "fdiv.d", DINX>;
@@ -140,8 +140,8 @@ defm FSGNJX_D : FPALU_rr_m<0b0010001, 0b010, "fsgnjx.d", DINX>;
}
let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in {
-defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX>;
-defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX>;
+defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX, /*Commutable*/1>;
+defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX, /*Commutable*/1>;
}
defm FCVT_S_D : FPUnaryOp_r_frm_m<0b0100000, 0b00001, FDINX, "fcvt.s.d">,
@@ -152,7 +152,7 @@ defm FCVT_D_S : FPUnaryOp_r_m<0b0100001, 0b00000, 0b000, DFINX, "fcvt.d.s">,
Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>;
let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in {
-defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX>;
+defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX, /*Commutable*/1>;
defm FLT_D : FPCmp_rr_m<0b1010001, 0b001, "flt.d", DINX>;
defm FLE_D : FPCmp_rr_m<0b1010001, 0b000, "fle.d", DINX>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index a2cd4a0..b1077ae 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -187,28 +187,32 @@ multiclass FPFMADynFrmAlias_m<FPFMA_rrr_frm Inst, string OpcodeStr,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in
class FPALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr,
- DAGOperand rty>
+ DAGOperand rty, bit Commutable>
: RVInstR<funct7, funct3, OPC_OP_FP, (outs rty:$rd),
- (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">;
+ (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> {
+ let isCommutable = Commutable;
+}
multiclass FPALU_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr,
- list<ExtInfo_r> Exts> {
+ list<ExtInfo_r> Exts, bit Commutable = 0> {
foreach Ext = Exts in
let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in
- def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg>;
+ def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1,
UseNamedOperandTable = 1, hasPostISelHook = 1 in
-class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty>
+class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty,
+ bit Commutable>
: RVInstRFrm<funct7, OPC_OP_FP, (outs rty:$rd),
(ins rty:$rs1, rty:$rs2, frmarg:$frm), opcodestr,
- "$rd, $rs1, $rs2, $frm">;
-
+ "$rd, $rs1, $rs2, $frm"> {
+ let isCommutable = Commutable;
+}
multiclass FPALU_rr_frm_m<bits<7> funct7, string opcodestr,
- list<ExtInfo_r> Exts> {
+ list<ExtInfo_r> Exts, bit Commutable = 0> {
foreach Ext = Exts in
let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in
- def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg>;
+ def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg, Commutable>;
}
class FPALUDynFrmAlias<FPALU_rr_frm Inst, string OpcodeStr,
@@ -269,14 +273,16 @@ multiclass FPUnaryOpDynFrmAlias_m<FPUnaryOp_r_frm Inst, string OpcodeStr,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in
class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr,
- DAGOperand rty>
+ DAGOperand rty, bit Commutable>
: RVInstR<funct7, funct3, OPC_OP_FP, (outs GPR:$rd),
- (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">;
+ (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> {
+ let isCommutable = Commutable;
+}
multiclass FPCmp_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr,
- list<ExtInfo_r> Exts> {
+ list<ExtInfo_r> Exts, bit Commutable = 0> {
foreach Ext = Exts in
let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in
- def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg>;
+ def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>;
}
//===----------------------------------------------------------------------===//
@@ -305,11 +311,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_S, "fnmsub.s", FINX>;
defm : FPFMADynFrmAlias_m<FNMADD_S, "fnmadd.s", FINX>;
let SchedRW = [WriteFALU32, ReadFALU32, ReadFALU32] in {
-defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX>;
+defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX, /*Commutable*/1>;
defm FSUB_S : FPALU_rr_frm_m<0b0000100, "fsub.s", FINX>;
}
let SchedRW = [WriteFMul32, ReadFMul32, ReadFMul32] in
-defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX>;
+defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX, /*Commutable*/1>;
let SchedRW = [WriteFDiv32, ReadFDiv32, ReadFDiv32] in
defm FDIV_S : FPALU_rr_frm_m<0b0001100, "fdiv.s", FINX>;
@@ -331,8 +337,8 @@ defm FSGNJX_S : FPALU_rr_m<0b0010000, 0b010, "fsgnjx.s", FINX>;
}
let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in {
-defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX>;
-defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX>;
+defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX, /*Commutable*/1>;
+defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX, /*Commutable*/1>;
}
defm FCVT_W_S : FPUnaryOp_r_frm_m<0b1100000, 0b00000, XFINX, "fcvt.w.s">,
@@ -348,7 +354,7 @@ def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">,
Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>;
let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in {
-defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX>;
+defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX, /*Commutable*/1>;
defm FLT_S : FPCmp_rr_m<0b1010000, 0b001, "flt.s", FINX>;
defm FLE_S : FPCmp_rr_m<0b1010000, 0b000, "fle.s", FINX>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index edaf158..835a0f5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -109,11 +109,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_H, "fnmsub.h", HINX>;
defm : FPFMADynFrmAlias_m<FNMADD_H, "fnmadd.h", HINX>;
let SchedRW = [WriteFALU16, ReadFALU16, ReadFALU16] in {
-defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX>;
+defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX, /*Commutable*/1>;
defm FSUB_H : FPALU_rr_frm_m<0b0000110, "fsub.h", HINX>;
}
let SchedRW = [WriteFMul16, ReadFMul16, ReadFMul16] in
-defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX>;
+defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX, /*Commutable*/1>;
let SchedRW = [WriteFDiv16, ReadFDiv16, ReadFDiv16] in
defm FDIV_H : FPALU_rr_frm_m<0b0001110, "fdiv.h", HINX>;
@@ -135,8 +135,8 @@ defm FSGNJX_H : FPALU_rr_m<0b0010010, 0b010, "fsgnjx.h", HINX>;
}
let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in {
-defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX>;
-defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX>;
+defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX, /*Commutable*/1>;
+defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX, /*Commutable*/1>;
}
defm FCVT_W_H : FPUnaryOp_r_frm_m<0b1100010, 0b00000, XHINX, "fcvt.w.h">,
@@ -173,7 +173,7 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">,
} // Predicates = [HasStdExtZfhOrZfhmin]
let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in {
-defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX>;
+defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX, /*Commutable*/1>;
defm FLT_H : FPCmp_rr_m<0b1010010, 0b001, "flt.h", HINX>;
defm FLE_H : FPCmp_rr_m<0b1010010, 0b000, "fle.h", HINX>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index c167c09..4b34bba 100644
--- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -87,7 +87,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
return MCOperand::createExpr(ME);
}
-bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+bool llvm::lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp,
const AsmPrinter &AP) {
switch (MO.getType()) {
@@ -214,7 +214,7 @@ bool llvm::lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp;
- if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP))
+ if (lowerRISCVMachineOperandToMCOperand(MO, MCOp, AP))
OutMI.addOperand(MCOp);
}
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 4f37acc..60e1b05 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -1590,9 +1590,11 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
if (getParser().parseExpression(Expr))
return MatchOperand_NoMatch;
- auto isOutOfRangeConstant = [&](const MCExpr *E) -> bool {
+ auto isOutOfRangeConstant = [&](const MCExpr *E, bool Negate) -> bool {
if (auto *CE = dyn_cast<MCConstantExpr>(E)) {
int64_t Value = CE->getValue();
+ if (Negate)
+ Value = -Value;
if ((Value & 1) || Value < MinVal || Value > MaxVal)
return true;
}
@@ -1606,7 +1608,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
Error(StartLoc, "Expected PC-relative expression");
return MatchOperand_ParseFail;
}
- if (isOutOfRangeConstant(CE)) {
+ if (isOutOfRangeConstant(CE, false)) {
Error(StartLoc, "offset out of range");
return MatchOperand_ParseFail;
}
@@ -1621,8 +1623,9 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
// For consistency with the GNU assembler, conservatively assume that a
// constant offset must by itself be within the given size range.
if (const auto *BE = dyn_cast<MCBinaryExpr>(Expr))
- if (isOutOfRangeConstant(BE->getLHS()) ||
- isOutOfRangeConstant(BE->getRHS())) {
+ if (isOutOfRangeConstant(BE->getLHS(), false) ||
+ isOutOfRangeConstant(BE->getRHS(),
+ BE->getOpcode() == MCBinaryExpr::Sub)) {
Error(StartLoc, "offset out of range");
return MatchOperand_ParseFail;
}
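
A standalone sketch of the range check above (function name and the concrete range are illustrative, not SystemZAsmParser API): a constant on the right-hand side of a subtraction contributes its negated value to the PC-relative offset, so it must be negated before the even/min/max test.

```cpp
#include <cassert>
#include <cstdint>

static bool isOutOfRange(int64_t Value, bool Negate, int64_t MinVal,
                         int64_t MaxVal) {
  if (Negate)
    Value = -Value;
  return (Value & 1) || Value < MinVal || Value > MaxVal;
}

int main() {
  // Hypothetical even-offset field covering [-0x10000, 0xfffe]. In
  // "sym - 0x10000" the constant sits under a subtraction, so it is checked
  // as -0x10000 (in range); without Negate it would be rejected as > MaxVal.
  assert(isOutOfRange(0x10000, /*Negate=*/false, -0x10000, 0xfffe));
  assert(!isOutOfRange(0x10000, /*Negate=*/true, -0x10000, 0xfffe));
}
```
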
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index d825981..368b05e 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,14 +48,18 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseudo instruction for RA.
+ let mayLoad = 1 in
def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src),
[(int_x86_ldtilecfg_internal addr:$src)]>;
+ let mayLoad = 1 in
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
+ let mayLoad = 1 in
def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
+ let mayStore = 1 in
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
@@ -67,9 +71,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
+ let mayLoad = 1 in
def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ let mayLoad = 1 in
def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
sibmem:$src2), []>;
+ let mayStore = 1 in
def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
[(int_x86_tilezero timm:$src)]>;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0ad3e6c..81f258d 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -964,7 +964,7 @@ static void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
bool Change = false;
for (auto *Cast : Casts) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Cast);
+ auto *II = cast<IntrinsicInst>(Cast);
// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42)
// store <256 x i32> %43, <256 x i32>* %p, align 64
// -->
@@ -984,7 +984,7 @@ static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
Store->eraseFromParent();
} else { // x86_cast_vector_to_tile
SmallVector<Instruction *, 2> DeadLoads;
- LoadInst *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
+ auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
if (!Load || !Load->hasOneUse())
continue;
// %65 = load <256 x i32>, <256 x i32>* %p, align 64
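
A short sketch of the `cast<>`/`dyn_cast<>` contract the hunk above relies on (helper name illustrative): every entry in `Casts` is known to be a tile-cast intrinsic, so the stronger `cast<>` documents and asserts that invariant, while the feeding operand may be anything and still needs `dyn_cast<>` plus a null check.

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

static LoadInst *getFeedingLoad(Instruction *KnownIntrinsic) {
  // The caller guarantees this is an IntrinsicInst; cast<> never returns
  // null and asserts the type in +Asserts builds.
  auto *II = cast<IntrinsicInst>(KnownIntrinsic);
  // The operand may or may not be a load, so use dyn_cast<> and let the
  // caller handle a null result.
  return dyn_cast<LoadInst>(II->getArgOperand(0));
}
```
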
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 67ed1d8..05364e3 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -935,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
- "MOVZX(16|32|64)rm(8|16)",
- "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67?
+ "MOVZX(16|32|64)rm(8|16)")>;
def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 5;
@@ -993,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm,
VPBROADCASTDrm,
VPBROADCASTQrm)>;
def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm",
- "(V?)MOVSLDUPrm")>;
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVDDUPrm")>;
def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
let Latency = 6;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 0189acd..b682b51 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1055,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
let ResourceCycles = [1];
}
def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
- "MOVZX(16|32|64)rm(8|16)",
- "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71?
+ "MOVZX(16|32|64)rm(8|16)")>;
def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
@@ -1159,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm,
VPBROADCASTDrm,
- VPBROADCASTQrm,
- VMOVSHDUPrm,
- VMOVSLDUPrm,
- MOVSHDUPrm,
- MOVSLDUPrm)>;
+ VPBROADCASTQrm)>;
+def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVDDUPrm")>;
def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> {
let Latency = 6;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 2103169..5051d4c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -485,12 +485,6 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
def Zn2WriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
}
-defm : Zn2WriteResPair<WriteDPPS, [], 15>;
-defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
-defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
-defm : Zn2WriteResPair<WritePHAdd, [], 3>;
-defm : Zn2WriteResPair<WritePHAddX, [], 3>;
-defm : Zn2WriteResPair<WritePHAddY, [], 3>;
def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
@@ -1108,6 +1102,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
//-- Arithmetic instructions --//
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResPair<WritePHAddY, [], 3>;
+
// PCMPGTQ.
def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1478,6 +1480,7 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
// x,m,i / v,v,m,i.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 7f4ca3a..a0bde8dc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3833,10 +3833,21 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
}
}
- // TODO: Use default extraction for now, but we should investigate extending this
- // to handle repeated subvector extraction.
- if (Extract)
+ if (Extract) {
+ // vXi1 can be efficiently extracted with MOVMSK.
+ // TODO: AVX512 predicate mask handling.
+ // NOTE: This doesn't work well for roundtrip scalarization.
+ if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
+ unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
+ return MOVMSKCost;
+ }
+
+ // TODO: Use default extraction for now, but we should investigate extending
+ // this to handle repeated subvector extraction.
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+ }
return Cost;
}
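
A standalone illustration of the MOVMSK cost formula introduced above: one MOVMSK covers up to 32 vXi1 elements with AVX2 (16 otherwise), so the extraction cost is a ceiling division of the element count.

```cpp
#include <cassert>

static unsigned movmskCost(unsigned NumElts, bool HasAVX2) {
  unsigned MaxElts = HasAVX2 ? 32 : 16;
  return (NumElts + MaxElts - 1) / MaxElts; // ceil(NumElts / MaxElts)
}

int main() {
  assert(movmskCost(8, /*HasAVX2=*/false) == 1);  // one MOVMSK
  assert(movmskCost(64, /*HasAVX2=*/true) == 2);  // two 32-element chunks
  assert(movmskCost(33, /*HasAVX2=*/true) == 2);  // partial chunk still costs 1
}
```
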
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 65bc392..a33cb0b 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -218,10 +218,17 @@ static Function *doPromotion(
LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
<< "From: " << *F);
+ uint64_t LargestVectorWidth = 0;
+ for (auto *I : Params)
+ if (auto *VT = dyn_cast<llvm::VectorType>(I))
+ LargestVectorWidth = std::max(
+ LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize());
+
// Recompute the parameter attributes list based on the new arguments for
// the function.
NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(),
PAL.getRetAttrs(), ArgAttrVec));
+ AttributeFuncs::updateMinLegalVectorWidthAttr(*NF, LargestVectorWidth);
ArgAttrVec.clear();
F->getParent()->getFunctionList().insert(F->getIterator(), NF);
@@ -313,6 +320,9 @@ static Function *doPromotion(
Args.clear();
ArgAttrVec.clear();
+ AttributeFuncs::updateMinLegalVectorWidthAttr(*CB.getCaller(),
+ LargestVectorWidth);
+
// Update the callgraph to know that the callsite has been transformed.
if (ReplaceCallSite)
(*ReplaceCallSite)(CB, *NewCS);
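
A minimal sketch of the width bookkeeping added above, assuming LLVM IR headers; it mirrors the loop in the hunk (the same pattern reappears in the Attributor change below) rather than adding new behavior.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/DerivedTypes.h"
#include <algorithm>
#include <cstdint>

// Largest known-minimum vector width (in bits) among the new parameter
// types; scalar parameters contribute nothing. This is the value fed to
// AttributeFuncs::updateMinLegalVectorWidthAttr in the hunk above.
static uint64_t largestVectorWidth(llvm::ArrayRef<llvm::Type *> Params) {
  uint64_t Largest = 0;
  for (llvm::Type *T : Params)
    if (auto *VT = llvm::dyn_cast<llvm::VectorType>(T))
      Largest = std::max(Largest,
                         VT->getPrimitiveSizeInBits().getKnownMinSize());
  return Largest;
}
```
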
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 247da10..40dd6ee 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2486,6 +2486,12 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
}
}
+ uint64_t LargestVectorWidth = 0;
+ for (auto *I : NewArgumentTypes)
+ if (auto *VT = dyn_cast<llvm::VectorType>(I))
+ LargestVectorWidth = std::max(
+ LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize());
+
FunctionType *OldFnTy = OldFn->getFunctionType();
Type *RetTy = OldFnTy->getReturnType();
@@ -2515,6 +2521,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
NewFn->setAttributes(AttributeList::get(
Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(),
NewArgumentAttributes));
+ AttributeFuncs::updateMinLegalVectorWidthAttr(*NewFn, LargestVectorWidth);
// Since we have now created the new function, splice the body of the old
// function right into the new function, leaving the old rotting hulk of the
@@ -2592,6 +2599,9 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
Ctx, OldCallAttributeList.getFnAttrs(),
OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes));
+ AttributeFuncs::updateMinLegalVectorWidthAttr(*NewCB->getCaller(),
+ LargestVectorWidth);
+
CallSitePairs.push_back({OldCB, NewCB});
return true;
};
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 72b94cd..157cb27e 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
@@ -904,7 +905,7 @@ OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI,
}
}
- SmallPtrSet<Constant *, 1> RepValues;
+ SmallSetVector<Constant *, 1> RepValues;
RepValues.insert(NewGV);
// If there is a comparison against null, we will insert a global bool to
diff --git a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 6ec3c61..76f8f1a 100644
--- a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -29,7 +29,7 @@ static bool inferAllPrototypeAttributes(
// explicitly visited by CGSCC passes in the new pass manager.)
if (F.isDeclaration() && !F.hasOptNone()) {
if (!F.hasFnAttribute(Attribute::NoBuiltin))
- Changed |= inferLibFuncAttributes(F, GetTLI(F));
+ Changed |= inferNonMandatoryLibFuncAttrs(F, GetTLI(F));
Changed |= inferAttributesFromOthers(F);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 4a62afc..39a32e5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -759,7 +759,7 @@ getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) {
V = GEP->getOperand(0);
Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
Index = ConstantExpr::getAdd(
- Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+ Index, ConstantExpr::getSExtOrTrunc(GEPIndex, IndexType));
continue;
}
break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index ab35698..b044b8a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1298,6 +1298,8 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I,
}
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
+ Module *M = I.getModule();
+
if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
@@ -1363,8 +1365,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
!IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
- if ((IsTan || IsCot) &&
- hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
+ if ((IsTan || IsCot) && hasFloatFn(M, &TLI, I.getType(), LibFunc_tan,
+ LibFunc_tanf, LibFunc_tanl)) {
IRBuilder<> B(&I);
IRBuilder<>::FastMathFlagGuard FMFGuard(B);
B.setFastMathFlags(I.getFastMathFlags());
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 0e65a44..780a446 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -32,6 +32,7 @@
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -408,6 +409,25 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
if (Access.Addr->isSwiftError())
return None;
+ // Peel off GEPs and BitCasts.
+ auto *Addr = Access.Addr->stripInBoundsOffsets();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ // Do not instrument PGO counter updates.
+ if (GV->hasSection()) {
+ StringRef SectionName = GV->getSection();
+ // Check if the global is in the PGO counters section.
+ auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+ return None;
+ }
+
+ // Do not instrument accesses to LLVM internal variables.
+ if (GV->getName().startswith("__llvm"))
+ return None;
+ }
+
const DataLayout &DL = I->getModule()->getDataLayout();
Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy);
return Access;
@@ -613,8 +633,6 @@ bool MemProfiler::instrumentFunction(Function &F) {
initializeCallbacks(*F.getParent());
- FunctionModified |= insertDynamicShadowAtFunctionEntry(F);
-
SmallVector<Instruction *, 16> ToInstrument;
// Fill the set of memory operations to instrument.
@@ -625,6 +643,15 @@ bool MemProfiler::instrumentFunction(Function &F) {
}
}
+ if (ToInstrument.empty()) {
+ LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified
+ << " " << F << "\n");
+
+ return FunctionModified;
+ }
+
+ FunctionModified |= insertDynamicShadowAtFunctionEntry(F);
+
int NumInstrumented = 0;
for (auto *Inst : ToInstrument) {
if (ClDebugMin < 0 || ClDebugMax < 0 ||
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 5338032..2610ef1 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -774,12 +774,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
unsigned NumOrigPreds = Preds.size();
// We can only sink instructions through unconditional branches.
- for (auto I = Preds.begin(); I != Preds.end();) {
- if ((*I)->getTerminator()->getNumSuccessors() != 1)
- I = Preds.erase(I);
- else
- ++I;
- }
+ llvm::erase_if(Preds, [](BasicBlock *BB) {
+ return BB->getTerminator()->getNumSuccessors() != 1;
+ });
LockstepReverseIterator LRI(Preds);
SmallVector<SinkingInstructionCandidate, 4> Candidates;
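
The change above replaces a manual erase loop with `llvm::erase_if` (from llvm/ADT/STLExtras.h), which wraps the classic erase/remove_if idiom. A standalone equivalent using the standard library, filtering out "predecessors" whose successor count is not one:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> SuccessorCounts = {1, 2, 1, 3, 1};
  // Keep only "unconditional" entries, i.e. those with exactly one successor.
  SuccessorCounts.erase(
      std::remove_if(SuccessorCounts.begin(), SuccessorCounts.end(),
                     [](int N) { return N != 1; }),
      SuccessorCounts.end());
  assert(SuccessorCounts.size() == 3);
}
```
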
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 11c756c..87202f7 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1100,6 +1100,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *StoredVal, Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
+ Module *M = TheStore->getModule();
Value *SplatValue = isBytewiseValue(StoredVal, *DL);
Constant *PatternValue = nullptr;
@@ -1182,15 +1183,14 @@ bool LoopIdiomRecognize::processLoopStridedStore(
NewCall = Builder.CreateMemSet(
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
- } else {
+ } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
// Everything is emitted in default address space
Type *Int8PtrTy = DestInt8PtrTy;
- Module *M = TheStore->getModule();
StringRef FuncName = "memset_pattern16";
- FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntIdxTy);
- inferLibFuncAttributes(M, FuncName, *TLI);
+ FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
+ Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
+ inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
// a constant array of 16 bytes. Plop the value into a mergeable global.
@@ -1201,7 +1201,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
GV->setAlignment(Align(16));
Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
- }
+ } else
+ return Changed;
+
NewCall->setDebugLoc(TheStore->getDebugLoc());
if (MSSAU) {
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index cff8f51..344f89e 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -575,9 +575,11 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
if (OpI->getType()->isVectorTy()) {
Scattered[I] = scatter(&CI, OpI);
assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
+ Tys.push_back(OpI->getType()->getScalarType());
} else {
ScalarOperands[I] = OpI;
- if (hasVectorIntrinsicOverloadedScalarOpd(ID, I))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
Tys.push_back(OpI->getType());
}
}
@@ -593,7 +595,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
ScalarCallOps.clear();
for (unsigned J = 0; J != NumArgs; ++J) {
- if (hasVectorIntrinsicScalarOpd(ID, J))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
ScalarCallOps.push_back(ScalarOperands[J]);
else
ScalarCallOps.push_back(Scattered[J][Elem]);
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index b3f1229..14c1fed 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -204,13 +204,19 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
/// branch on a single value.
static void buildPartialUnswitchConditionalBranch(
BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
- BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) {
+ BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
+ Instruction *I, AssumptionCache *AC, DominatorTree &DT) {
IRBuilder<> IRB(&BB);
- Value *Cond = Direction ? IRB.CreateOr(Invariants) :
- IRB.CreateAnd(Invariants);
- if (InsertFreeze)
- Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr");
+ SmallVector<Value *> FrozenInvariants;
+ for (Value *Inv : Invariants) {
+ if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT))
+ Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr");
+ FrozenInvariants.push_back(Inv);
+ }
+
+ Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants)
+ : IRB.CreateAnd(FrozenInvariants);
IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
Direction ? &NormalSucc : &UnswitchedSucc);
}
@@ -572,10 +578,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
" condition!");
buildPartialUnswitchConditionalBranch(
*OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH,
- FreezeLoopUnswitchCond && any_of(Invariants, [&](Value *C) {
- return !isGuaranteedNotToBeUndefOrPoison(C, nullptr,
- OldPH->getTerminator(), &DT);
- }));
+ FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT);
}
// Update the dominator tree with the added edge.
@@ -2318,11 +2321,9 @@ static void unswitchNontrivialInvariants(
buildPartialInvariantUnswitchConditionalBranch(
*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU);
else {
- buildPartialUnswitchConditionalBranch(
- *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH,
- InsertFreeze && any_of(Invariants, [&](Value *C) {
- return !isGuaranteedNotToBeUndefOrPoison(C, &AC, BI, &DT);
- }));
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH, InsertFreeze,
+ BI, &AC, DT);
}
DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
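
A sketch of the per-operand freeze strategy adopted above (helper name illustrative, LLVM headers assumed): each invariant that may be undef or poison is frozen individually before the values are combined, instead of freezing the already-combined condition.

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *buildFrozenOr(IRBuilderBase &IRB, ArrayRef<Value *> Invariants,
                            AssumptionCache *AC, Instruction *CtxI,
                            DominatorTree &DT) {
  SmallVector<Value *> Frozen;
  for (Value *Inv : Invariants) {
    // Only freeze operands that are not already known to be well-defined.
    if (!isGuaranteedNotToBeUndefOrPoison(Inv, AC, CtxI, &DT))
      Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr");
    Frozen.push_back(Inv);
  }
  return IRB.CreateOr(Frozen);
}
```
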
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 1f4f1c9..40fd407 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -39,7 +39,6 @@ STATISTIC(NumInaccessibleMemOrArgMemOnly,
STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly");
-STATISTIC(NumExtArg, "Number of arguments inferred as signext/zeroext.");
STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns");
@@ -147,16 +146,6 @@ static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) {
return true;
}
-static bool setArgExtAttr(Function &F, unsigned ArgNo,
- const TargetLibraryInfo &TLI, bool Signed = true) {
- Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed);
- if (ExtAttr == Attribute::None || F.hasParamAttribute(ArgNo, ExtAttr))
- return false;
- F.addParamAttr(ArgNo, ExtAttr);
- ++NumExtArg;
- return true;
-}
-
static bool setRetNoUndef(Function &F) {
if (!F.getReturnType()->isVoidTy() &&
!F.hasRetAttribute(Attribute::NoUndef)) {
@@ -231,6 +220,13 @@ static bool setAlignedAllocParam(Function &F, unsigned ArgNo) {
return true;
}
+static bool setAllocatedPointerParam(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::AllocatedPointer))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::AllocatedPointer);
+ return true;
+}
+
static bool setAllocSize(Function &F, unsigned ElemSizeArg,
Optional<unsigned> NumElemsArg) {
if (F.hasFnAttribute(Attribute::AllocSize))
@@ -240,15 +236,23 @@ static bool setAllocSize(Function &F, unsigned ElemSizeArg,
return true;
}
-bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
- const TargetLibraryInfo &TLI) {
+static bool setAllocFamily(Function &F, StringRef Family) {
+ if (F.hasFnAttribute("alloc-family"))
+ return false;
+ F.addFnAttr("alloc-family", Family);
+ return true;
+}
+
+bool llvm::inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name,
+ const TargetLibraryInfo &TLI) {
Function *F = M->getFunction(Name);
if (!F)
return false;
- return inferLibFuncAttributes(*F, TLI);
+ return inferNonMandatoryLibFuncAttrs(*F, TLI);
}
-bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
+ const TargetLibraryInfo &TLI) {
LibFunc TheLibFunc;
if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
return false;
@@ -376,6 +380,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setArgNoUndef(F, 1);
LLVM_FALLTHROUGH;
case LibFunc_strdup:
+ Changed |= setAllocFamily(F, "malloc");
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
@@ -437,7 +442,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
LLVM_FALLTHROUGH;
case LibFunc_valloc:
case LibFunc_malloc:
+ Changed |= setAllocFamily(F, "malloc");
+ LLVM_FALLTHROUGH;
case LibFunc_vec_malloc:
+ Changed |= setAllocFamily(F, "vec_malloc");
Changed |= setAllocSize(F, 0, None);
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetAndArgsNoUndef(F);
@@ -501,6 +509,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
case LibFunc_memalign:
+ Changed |= setAllocFamily(F, "malloc");
Changed |= setAllocSize(F, 1, None);
Changed |= setAlignedAllocParam(F, 0);
Changed |= setOnlyAccessesInaccessibleMemory(F);
@@ -522,8 +531,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
return Changed;
case LibFunc_realloc:
- case LibFunc_vec_realloc:
case LibFunc_reallocf:
+ Changed |= setAllocFamily(F, "malloc");
+ LLVM_FALLTHROUGH;
+ case LibFunc_vec_realloc:
+ Changed |= setAllocFamily(F, "vec_malloc");
+ Changed |= setAllocatedPointerParam(F, 0);
Changed |= setAllocSize(F, 1, None);
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setRetNoUndef(F);
@@ -597,7 +610,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setOnlyWritesMemory(F, 0);
return Changed;
case LibFunc_calloc:
+ Changed |= setAllocFamily(F, "malloc");
+ LLVM_FALLTHROUGH;
case LibFunc_vec_calloc:
+ Changed |= setAllocFamily(F, "vec_malloc");
Changed |= setAllocSize(F, 0, 1);
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetAndArgsNoUndef(F);
@@ -656,7 +672,11 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
return Changed;
case LibFunc_free:
+ Changed |= setAllocFamily(F, "malloc");
+ LLVM_FALLTHROUGH;
case LibFunc_vec_free:
+ Changed |= setAllocFamily(F, "vec_malloc");
+ Changed |= setAllocatedPointerParam(F, 0);
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -845,7 +865,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_putchar:
case LibFunc_putchar_unlocked:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setArgExtAttr(F, 0, TLI);
Changed |= setDoesNotThrow(F);
return Changed;
case LibFunc_popen:
@@ -1066,7 +1085,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_ldexp:
case LibFunc_ldexpf:
case LibFunc_ldexpl:
- Changed |= setArgExtAttr(F, 1, TLI);
Changed |= setWillReturn(F);
return Changed;
case LibFunc_abs:
@@ -1203,34 +1221,141 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
}
}
-bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+static void setArgExtAttr(Function &F, unsigned ArgNo,
+ const TargetLibraryInfo &TLI, bool Signed = true) {
+ Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed);
+ if (ExtAttr != Attribute::None && !F.hasParamAttribute(ArgNo, ExtAttr))
+ F.addParamAttr(ArgNo, ExtAttr);
+}
+
+FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+ LibFunc TheLibFunc, FunctionType *T,
+ AttributeList AttributeList) {
+ assert(TLI.has(TheLibFunc) &&
+ "Creating call to non-existing library function.");
+ StringRef Name = TLI.getName(TheLibFunc);
+ FunctionCallee C = M->getOrInsertFunction(Name, T, AttributeList);
+
+ // Make sure any mandatory argument attributes are added.
+
+ // Any outgoing i32 argument should be handled with setArgExtAttr() which
+ // will add an extension attribute if the target ABI requires it. Adding
+ // argument extensions is typically done by the front end but when an
+ // optimizer is building a library call on its own it has to take care of
+ // this. Each such generated function must be handled here with sign or
+ // zero extensions as needed. F is retreived with cast<> because we demand
+ // of the caller to have called isLibFuncEmittable() first.
+ Function *F = cast<Function>(C.getCallee());
+ assert(F->getFunctionType() == T && "Function type does not match.");
+ switch (TheLibFunc) {
+ case LibFunc_fputc:
+ case LibFunc_putchar:
+ setArgExtAttr(*F, 0, TLI);
+ break;
+ case LibFunc_ldexp:
+ case LibFunc_ldexpf:
+ case LibFunc_ldexpl:
+ case LibFunc_memchr:
+ case LibFunc_strchr:
+ setArgExtAttr(*F, 1, TLI);
+ break;
+ case LibFunc_memccpy:
+ setArgExtAttr(*F, 2, TLI);
+ break;
+
+ // These are functions that are known to not need any argument extension
+ // on any target: A size_t argument (which may be an i32 on some targets)
+ // should not trigger the assert below.
+ case LibFunc_bcmp:
+ case LibFunc_calloc:
+ case LibFunc_fwrite:
+ case LibFunc_malloc:
+ case LibFunc_memcmp:
+ case LibFunc_memcpy_chk:
+ case LibFunc_mempcpy:
+ case LibFunc_memset_pattern16:
+ case LibFunc_snprintf:
+ case LibFunc_stpncpy:
+ case LibFunc_strlcat:
+ case LibFunc_strlcpy:
+ case LibFunc_strncat:
+ case LibFunc_strncmp:
+ case LibFunc_strncpy:
+ case LibFunc_vsnprintf:
+ break;
+
+ default:
+#ifndef NDEBUG
+ for (unsigned i = 0; i < T->getNumParams(); i++)
+ assert(!isa<IntegerType>(T->getParamType(i)) &&
+ "Unhandled integer argument.");
+#endif
+ break;
+ }
+
+ return C;
+}
+
+FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI,
+ LibFunc TheLibFunc, FunctionType *T) {
+ return getOrInsertLibFunc(M, TLI, TheLibFunc, T, AttributeList());
+}
+
+bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
+ LibFunc TheLibFunc) {
+ StringRef FuncName = TLI->getName(TheLibFunc);
+ if (!TLI->has(TheLibFunc))
+ return false;
+
+ // Check if the Module already has a GlobalValue with the same name, in
+ // which case it must be a Function with the expected type.
+ if (GlobalValue *GV = M->getNamedValue(FuncName)) {
+ if (auto *F = dyn_cast<Function>(GV))
+ return TLI->isValidProtoForLibFunc(*F->getFunctionType(), TheLibFunc, *M);
+ return false;
+ }
+
+ return true;
+}
+
+bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
+ StringRef Name) {
+ LibFunc TheLibFunc;
+ return TLI->getLibFunc(Name, TheLibFunc) &&
+ isLibFuncEmittable(M, TLI, TheLibFunc);
+}
+
+bool llvm::hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty,
LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) {
switch (Ty->getTypeID()) {
case Type::HalfTyID:
return false;
case Type::FloatTyID:
- return TLI->has(FloatFn);
+ return isLibFuncEmittable(M, TLI, FloatFn);
case Type::DoubleTyID:
- return TLI->has(DoubleFn);
+ return isLibFuncEmittable(M, TLI, DoubleFn);
default:
- return TLI->has(LongDoubleFn);
+ return isLibFuncEmittable(M, TLI, LongDoubleFn);
}
}
-StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn) {
- assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
+StringRef llvm::getFloatFn(const Module *M, const TargetLibraryInfo *TLI,
+ Type *Ty, LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn, LibFunc &TheLibFunc) {
+ assert(hasFloatFn(M, TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
"Cannot get name for unavailable function!");
switch (Ty->getTypeID()) {
case Type::HalfTyID:
llvm_unreachable("No name for HalfTy!");
case Type::FloatTyID:
+ TheLibFunc = FloatFn;
return TLI->getName(FloatFn);
case Type::DoubleTyID:
+ TheLibFunc = DoubleFn;
return TLI->getName(DoubleFn);
default:
+ TheLibFunc = LongDoubleFn;
return TLI->getName(LongDoubleFn);
}
}
@@ -1247,14 +1372,14 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
ArrayRef<Value *> Operands, IRBuilderBase &B,
const TargetLibraryInfo *TLI,
bool IsVaArgs = false) {
- if (!TLI->has(TheLibFunc))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, TheLibFunc))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef FuncName = TLI->getName(TheLibFunc);
FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs);
- FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType);
- inferLibFuncAttributes(M, FuncName, *TLI);
+ FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, FuncType);
+ inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
CallInst *CI = B.CreateCall(Callee, Operands, FuncName);
if (const Function *F =
dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
@@ -1323,16 +1448,16 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
IRBuilderBase &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_memcpy_chk))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_memcpy_chk))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
AttributeList AS;
AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee MemCpy = M->getOrInsertFunction(
- "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
+ FunctionCallee MemCpy = getOrInsertLibFunc(M, *TLI, LibFunc_memcpy_chk,
+ AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
DL.getIntPtrType(Context));
Dst = castToCStr(Dst, B);
@@ -1466,14 +1591,15 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
}
}
-static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
- IRBuilderBase &B,
- const AttributeList &Attrs) {
+static Value *emitUnaryFloatFnCallHelper(Value *Op, LibFunc TheLibFunc,
+ StringRef Name, IRBuilderBase &B,
+ const AttributeList &Attrs,
+ const TargetLibraryInfo *TLI) {
assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, Op->getType(), Op->getType());
+ FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op->getType(),
+ Op->getType());
CallInst *CI = B.CreateCall(Callee, Op, Name);
// The incoming attribute set may have come from a speculatable intrinsic, but
@@ -1488,12 +1614,16 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
return CI;
}
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
+Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+ StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op, Name, NameBuffer);
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+ LibFunc TheLibFunc;
+ TLI->getLibFunc(Name, TheLibFunc);
+
+ return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI);
}
Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
@@ -1501,23 +1631,25 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
LibFunc LongDoubleFn, IRBuilderBase &B,
const AttributeList &Attrs) {
// Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
+ Module *M = B.GetInsertBlock()->getModule();
+ LibFunc TheLibFunc;
+ StringRef Name = getFloatFn(M, TLI, Op->getType(), DoubleFn, FloatFn,
+ LongDoubleFn, TheLibFunc);
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+ return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI);
}
static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
+ LibFunc TheLibFunc,
StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs,
- const TargetLibraryInfo *TLI = nullptr) {
+ const TargetLibraryInfo *TLI) {
assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(),
- Op1->getType(), Op2->getType());
- if (TLI != nullptr)
- inferLibFuncAttributes(M, Name, *TLI);
+ FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op1->getType(),
+ Op1->getType(), Op2->getType());
+ inferNonMandatoryLibFuncAttrs(M, Name, *TLI);
CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name);
// The incoming attribute set may have come from a speculatable intrinsic, but
@@ -1532,15 +1664,19 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
return CI;
}
-Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilderBase &B,
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
+ const TargetLibraryInfo *TLI,
+ StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs) {
assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
SmallString<20> NameBuffer;
appendTypeSuffix(Op1, Name, NameBuffer);
- return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
+ LibFunc TheLibFunc;
+ TLI->getLibFunc(Name, TheLibFunc);
+
+ return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI);
}
Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
@@ -1549,22 +1685,24 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
LibFunc LongDoubleFn, IRBuilderBase &B,
const AttributeList &Attrs) {
// Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op1->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
+ Module *M = B.GetInsertBlock()->getModule();
+ LibFunc TheLibFunc;
+ StringRef Name = getFloatFn(M, TLI, Op1->getType(), DoubleFn, FloatFn,
+ LongDoubleFn, TheLibFunc);
- return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI);
+ return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI);
}
Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_putchar))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_putchar))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef PutCharName = TLI->getName(LibFunc_putchar);
- FunctionCallee PutChar =
- M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
- inferLibFuncAttributes(M, PutCharName, *TLI);
+ FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar,
+ B.getInt32Ty(), B.getInt32Ty());
+ inferNonMandatoryLibFuncAttrs(M, PutCharName, *TLI);
CallInst *CI = B.CreateCall(PutChar,
B.CreateIntCast(Char,
B.getInt32Ty(),
@@ -1580,14 +1718,14 @@ Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_puts))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_puts))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef PutsName = TLI->getName(LibFunc_puts);
- FunctionCallee PutS =
- M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
- inferLibFuncAttributes(M, PutsName, *TLI);
+ FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, LibFunc_puts, B.getInt32Ty(),
+ B.getInt8PtrTy());
+ inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI);
CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
if (const Function *F =
dyn_cast<Function>(PutS.getCallee()->stripPointerCasts()))
@@ -1597,15 +1735,15 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputc))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_fputc))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef FPutcName = TLI->getName(LibFunc_fputc);
- FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(),
- B.getInt32Ty(), File->getType());
+ FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputc, B.getInt32Ty(),
+ B.getInt32Ty(), File->getType());
if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutcName, *TLI);
+ inferNonMandatoryLibFuncAttrs(M, FPutcName, *TLI);
Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
"chari");
CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
@@ -1618,15 +1756,15 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputs))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_fputs))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef FPutsName = TLI->getName(LibFunc_fputs);
- FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
- B.getInt8PtrTy(), File->getType());
+ FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputs, B.getInt32Ty(),
+ B.getInt8PtrTy(), File->getType());
if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutsName, *TLI);
+ inferNonMandatoryLibFuncAttrs(M, FPutsName, *TLI);
CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
if (const Function *Fn =
@@ -1637,18 +1775,18 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fwrite))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_fwrite))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
StringRef FWriteName = TLI->getName(LibFunc_fwrite);
- FunctionCallee F = M->getOrInsertFunction(
- FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+ FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite,
+ DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context), File->getType());
if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FWriteName, *TLI);
+ inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI);
CallInst *CI =
B.CreateCall(F, {castToCStr(Ptr, B), Size,
ConstantInt::get(DL.getIntPtrType(Context), 1), File});
@@ -1661,15 +1799,15 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_malloc))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, TLI, LibFunc_malloc))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef MallocName = TLI->getName(LibFunc_malloc);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
- DL.getIntPtrType(Context));
- inferLibFuncAttributes(M, MallocName, *TLI);
+ FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc,
+ B.getInt8PtrTy(), DL.getIntPtrType(Context));
+ inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI);
CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
if (const Function *F =
@@ -1681,16 +1819,16 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B,
const TargetLibraryInfo &TLI) {
- if (!TLI.has(LibFunc_calloc))
+ Module *M = B.GetInsertBlock()->getModule();
+ if (!isLibFuncEmittable(M, &TLI, LibFunc_calloc))
return nullptr;
- Module *M = B.GetInsertBlock()->getModule();
StringRef CallocName = TLI.getName(LibFunc_calloc);
const DataLayout &DL = M->getDataLayout();
IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- FunctionCallee Calloc =
- M->getOrInsertFunction(CallocName, B.getInt8PtrTy(), PtrType, PtrType);
- inferLibFuncAttributes(M, CallocName, TLI);
+ FunctionCallee Calloc = getOrInsertLibFunc(M, TLI, LibFunc_calloc,
+ B.getInt8PtrTy(), PtrType, PtrType);
+ inferNonMandatoryLibFuncAttrs(M, CallocName, TLI);
CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
if (const auto *F =
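
A sketch of the emission pattern this file now standardizes on, mirroring emitPutChar above (no API beyond what the patch itself adds): gate on isLibFuncEmittable, declare through getOrInsertLibFunc so mandatory ABI attributes such as signext/zeroext are attached, then infer the optional attributes.

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

using namespace llvm;

static Value *emitPutCharSketch(Value *Char, IRBuilderBase &B,
                                const TargetLibraryInfo *TLI) {
  Module *M = B.GetInsertBlock()->getModule();
  // Bail out if the libcall is unavailable or its name is already taken by
  // something with a mismatched prototype.
  if (!isLibFuncEmittable(M, TLI, LibFunc_putchar))
    return nullptr;
  // getOrInsertLibFunc adds the mandatory argument extension attribute.
  FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar,
                                              B.getInt32Ty(), B.getInt32Ty());
  inferNonMandatoryLibFuncAttrs(M, TLI->getName(LibFunc_putchar), *TLI);
  Value *Arg = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned=*/true, "chari");
  return B.CreateCall(PutChar, {Arg});
}
```
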
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7a9a272..e72e3ce 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -500,6 +500,13 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
if (isMathLibCallNoop(Call, TLI))
return true;
+ // Non-volatile atomic loads from constants can be removed.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (auto *GV = dyn_cast<GlobalVariable>(
+ LI->getPointerOperand()->stripPointerCasts()))
+ if (!LI->isVolatile() && GV->isConstant())
+ return true;
+
return false;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 38dca39..0710511 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1190,13 +1190,15 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
}
Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
return V;
// memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
// bcmp can be more efficient than memcmp because it only has to know that
// there is a difference, not how different one is to the other.
- if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) {
+ if (isLibFuncEmittable(M, TLI, LibFunc_bcmp) &&
+ isOnlyUsedInZeroEqualityComparison(CI)) {
Value *LHS = CI->getArgOperand(0);
Value *RHS = CI->getArgOperand(1);
Value *Size = CI->getArgOperand(2);
@@ -1360,7 +1362,8 @@ static Value *valueHasFloatPrecision(Value *Val) {
/// Shrink double -> float functions.
static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isBinary, bool isPrecise = false) {
+ bool isBinary, const TargetLibraryInfo *TLI,
+ bool isPrecise = false) {
Function *CalleeFn = CI->getCalledFunction();
if (!CI->getType()->isDoubleTy() || !CalleeFn)
return nullptr;
@@ -1410,22 +1413,25 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]);
} else {
AttributeList CalleeAttrs = CalleeFn->getAttributes();
- R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs)
- : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs);
+ R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B,
+ CalleeAttrs)
+ : emitUnaryFloatFnCall(V[0], TLI, CalleeName, B, CalleeAttrs);
}
return B.CreateFPExt(R, B.getDoubleTy());
}
/// Shrink double -> float for unary functions.
static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI,
bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, false, isPrecise);
+ return optimizeDoubleFP(CI, B, false, TLI, isPrecise);
}
/// Shrink double -> float for binary functions.
static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI,
bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, true, isPrecise);
+ return optimizeDoubleFP(CI, B, true, TLI, isPrecise);
}
// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
@@ -1541,6 +1547,7 @@ static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) {
/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
+ Module *M = Pow->getModule();
Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
AttributeList Attrs; // Attributes are only meaningful on the original call
Module *Mod = Pow->getModule();
@@ -1568,7 +1575,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
Function *CalleeFn = BaseFn->getCalledFunction();
if (CalleeFn &&
- TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) {
+ TLI->getLibFunc(CalleeFn->getName(), LibFn) &&
+ isLibFuncEmittable(M, TLI, LibFn)) {
StringRef ExpName;
Intrinsic::ID ID;
Value *ExpFn;
@@ -1620,7 +1628,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
// pow(2.0, itofp(x)) -> ldexp(1.0, x)
if (match(Base, m_SpecificFP(2.0)) &&
(isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize()))
return copyFlags(*Pow,
emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI,
@@ -1629,7 +1637,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
}
// pow(2.0 ** n, x) -> exp2(n * x)
- if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
+ if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
APFloat BaseR = APFloat(1.0);
BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored);
BaseR = BaseR / *BaseF;
@@ -1656,7 +1664,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
// pow(10.0, x) -> exp10(x)
// TODO: There is no exp10() intrinsic yet, but some day there shall be one.
if (match(Base, m_SpecificFP(10.0)) &&
- hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
+ hasFloatFn(M, TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10,
LibFunc_exp10f, LibFunc_exp10l,
B, Attrs));
@@ -1681,7 +1689,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration(
Mod, Intrinsic::exp2, Ty),
FMul, "exp2"));
- else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
+ else if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f,
+ LibFunc_exp2l))
return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2,
LibFunc_exp2f,
LibFunc_exp2l, B, Attrs));
@@ -1702,7 +1711,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
}
// Otherwise, use the libcall for sqrt().
- if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl))
+ if (hasFloatFn(M, TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl))
// TODO: We also should check that the target can in fact lower the sqrt()
// libcall. We currently have no way to ask this question, so we ask if
// the target has a sqrt() libcall, which is not exactly the same.
@@ -1892,8 +1902,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
// Shrink pow() to powf() if the arguments are single precision,
// unless the result is expected to be double precision.
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) &&
- hasFloatVersion(Name)) {
- if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, true))
+ hasFloatVersion(M, Name)) {
+ if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, TLI, true))
return Shrunk;
}
@@ -1901,13 +1911,14 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
}
Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
AttributeList Attrs; // Attributes are only meaningful on the original call
StringRef Name = Callee->getName();
Value *Ret = nullptr;
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
- hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
+ hasFloatVersion(M, Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
Type *Ty = CI->getType();
Value *Op = CI->getArgOperand(0);
@@ -1915,7 +1926,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= IntSize
// Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < IntSize
if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
if (Value *Exp = getIntToFPVal(Op, B, TLI->getIntSize()))
return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
@@ -1926,12 +1937,14 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
}
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
+
// If we can shrink the call to a float function rather than a double
// function, do that first.
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
- if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
+ if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(M, Name))
+ if (Value *Ret = optimizeBinaryDoubleFP(CI, B, TLI))
return Ret;
// The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to
@@ -1962,8 +1975,8 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
Type *Ty = Log->getType();
Value *Ret = nullptr;
- if (UnsafeFPShrink && hasFloatVersion(LogNm))
- Ret = optimizeUnaryDoubleFP(Log, B, true);
+ if (UnsafeFPShrink && hasFloatVersion(Mod, LogNm))
+ Ret = optimizeUnaryDoubleFP(Log, B, TLI, true);
// The earlier call must also be 'fast' in order to do these transforms.
CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0));
@@ -2071,7 +2084,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
Log->doesNotAccessMemory()
? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
Arg->getOperand(0), "log")
- : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs);
+ : emitUnaryFloatFnCall(Arg->getOperand(0), TLI, LogNm, B, Attrs);
Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul");
// Since pow() may have side effects, e.g. errno,
// dead code elimination may not be trusted to remove it.
@@ -2094,7 +2107,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
Value *LogE = Log->doesNotAccessMemory()
? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
Eul, "log")
- : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
+ : emitUnaryFloatFnCall(Eul, TLI, LogNm, B, Attrs);
Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
// Since exp() may have side effects, e.g. errno,
// dead code elimination may not be trusted to remove it.
@@ -2106,14 +2119,16 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
}
Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
// TODO: Once we have a way (other than checking for the existence of the
// libcall) to tell whether our target can lower @llvm.sqrt, relax the
// condition below.
- if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
- Callee->getIntrinsicID() == Intrinsic::sqrt))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
+ if (isLibFuncEmittable(M, TLI, LibFunc_sqrtf) &&
+ (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
+ Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
if (!CI->isFast())
return Ret;
@@ -2158,7 +2173,6 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
// If we found a repeated factor, hoist it out of the square root and
// replace it with the fabs of that factor.
- Module *M = Callee->getParent();
Type *ArgType = I->getType();
Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
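A hypothetical example of the repeated-factor pattern handled here (scaled_root is an invented name; the transform applies only to fast sqrt calls):

    #include <cmath>
    // Hypothetical caller; under fast-math, sqrt(x * x * y) may be rewritten
    // as fabs(x) * sqrt(y), hoisting the repeated factor out of the root.
    double scaled_root(double x, double y) {
      return std::sqrt(x * x * y);
    }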
@@ -2175,11 +2189,12 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
// TODO: Generalize to handle any trig function and its inverse.
Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
StringRef Name = Callee->getName();
- if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
+ if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(M, Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
Value *Op1 = CI->getArgOperand(0);
auto *OpC = dyn_cast<CallInst>(Op1);
@@ -2195,7 +2210,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
// tanl(atanl(x)) -> x
LibFunc Func;
Function *F = OpC->getCalledFunction();
- if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+ if (F && TLI->getLibFunc(F->getName(), Func) &&
+ isLibFuncEmittable(M, TLI, Func) &&
((Func == LibFunc_atan && Callee->getName() == "tan") ||
(Func == LibFunc_atanf && Callee->getName() == "tanf") ||
(Func == LibFunc_atanl && Callee->getName() == "tanl")))
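A hypothetical example of the tan/atan fold guarded by the emittability check above (roundtrip is an invented name; both calls must carry the fast-math flags the code requires):

    #include <cmath>
    // Hypothetical caller; tan(atan(x)) may fold to x when both calls are
    // marked fast and the atan callee maps to the matching LibFunc.
    double roundtrip(double x) {
      return std::tan(std::atan(x));
    }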
@@ -2211,9 +2227,10 @@ static bool isTrigLibCall(CallInst *CI) {
CI->hasFnAttr(Attribute::ReadNone);
}
-static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
+static bool insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
bool UseFloat, Value *&Sin, Value *&Cos,
- Value *&SinCos) {
+ Value *&SinCos, const TargetLibraryInfo *TLI) {
+ Module *M = OrigCallee->getParent();
Type *ArgTy = Arg->getType();
Type *ResTy;
StringRef Name;
@@ -2233,9 +2250,12 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
ResTy = StructType::get(ArgTy, ArgTy);
}
- Module *M = OrigCallee->getParent();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
+ if (!isLibFuncEmittable(M, TLI, Name))
+ return false;
+ LibFunc TheLibFunc;
+ TLI->getLibFunc(Name, TheLibFunc);
+ FunctionCallee Callee = getOrInsertLibFunc(
+ M, *TLI, TheLibFunc, OrigCallee->getAttributes(), ResTy, ArgTy);
if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
// If the argument is an instruction, it must dominate all uses so put our
@@ -2259,6 +2279,8 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
"cospi");
}
+
+ return true;
}
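For context, a hypothetical sketch of the source pattern optimizeSinCosPi feeds into insertSinCosCall (sinpi and cospi are non-standard and only declared here; the function name both_halves is invented):

    // Hypothetical user code; when both calls use the same argument and the
    // sincospi libcall is emittable, they may be merged into a single call.
    extern "C" double sinpi(double);
    extern "C" double cospi(double);
    double both_halves(double x) {
      return sinpi(x) + cospi(x);
    }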
Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
@@ -2286,7 +2308,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
return nullptr;
Value *Sin, *Cos, *SinCos;
- insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+ if (!insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos,
+ SinCos, TLI))
+ return nullptr;
auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
Value *Res) {
@@ -2307,6 +2331,7 @@ void LibCallSimplifier::classifyArgUse(
SmallVectorImpl<CallInst *> &CosCalls,
SmallVectorImpl<CallInst *> &SinCosCalls) {
CallInst *CI = dyn_cast<CallInst>(Val);
+ Module *M = CI->getModule();
if (!CI || CI->use_empty())
return;
@@ -2317,7 +2342,8 @@ void LibCallSimplifier::classifyArgUse(
Function *Callee = CI->getCalledFunction();
LibFunc Func;
- if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
+ if (!Callee || !TLI->getLibFunc(*Callee, Func) ||
+ !isLibFuncEmittable(M, TLI, Func) ||
!isTrigLibCall(CI))
return;
@@ -2532,6 +2558,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (Value *V = optimizePrintFString(CI, B)) {
@@ -2540,10 +2567,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
// printf(format, ...) -> iprintf(format, ...) if no floating point
// arguments.
- if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee IPrintFFn =
- M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+ if (isLibFuncEmittable(M, TLI, LibFunc_iprintf) &&
+ !callHasFloatingPointArgument(CI)) {
+ FunctionCallee IPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_iprintf, FT,
+ Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(IPrintFFn);
B.Insert(New);
@@ -2552,11 +2579,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
// printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point
// arguments.
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_printf),
- FT, Callee->getAttributes());
+ if (isLibFuncEmittable(M, TLI, LibFunc_small_printf) &&
+ !callHasFP128Argument(CI)) {
+ auto SmallPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_printf, FT,
+ Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(SmallPrintFFn);
B.Insert(New);
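A hypothetical example of the calls these printf rewrites target (report is an invented name; the redirection only happens on targets whose TargetLibraryInfo marks iprintf or __small_printf as available):

    #include <cstdio>
    // Hypothetical caller with no floating-point arguments; such a call may be
    // redirected to iprintf (or, with no fp128 arguments, to __small_printf).
    void report(int n) {
      std::printf("count=%d\n", n);
    }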
@@ -2655,6 +2681,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
}
Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (Value *V = optimizeSPrintFString(CI, B)) {
@@ -2663,10 +2690,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
// sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
// point arguments.
- if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee SIPrintFFn =
- M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+ if (isLibFuncEmittable(M, TLI, LibFunc_siprintf) &&
+ !callHasFloatingPointArgument(CI)) {
+ FunctionCallee SIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_siprintf,
+ FT, Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(SIPrintFFn);
B.Insert(New);
@@ -2675,11 +2702,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
// sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit
// floating point arguments.
- if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallSPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf),
- FT, Callee->getAttributes());
+ if (isLibFuncEmittable(M, TLI, LibFunc_small_sprintf) &&
+ !callHasFP128Argument(CI)) {
+ auto SmallSPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_sprintf, FT,
+ Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(SmallSPrintFFn);
B.Insert(New);
@@ -2835,6 +2861,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
}
Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
+ Module *M = CI->getModule();
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (Value *V = optimizeFPrintFString(CI, B)) {
@@ -2843,10 +2870,10 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
// fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
// floating point arguments.
- if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee FIPrintFFn =
- M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+ if (isLibFuncEmittable(M, TLI, LibFunc_fiprintf) &&
+ !callHasFloatingPointArgument(CI)) {
+ FunctionCallee FIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_fiprintf,
+ FT, Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(FIPrintFFn);
B.Insert(New);
@@ -2855,11 +2882,11 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
// fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no
// 128-bit floating point arguments.
- if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
+ if (isLibFuncEmittable(M, TLI, LibFunc_small_fprintf) &&
+ !callHasFP128Argument(CI)) {
auto SmallFPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf),
- FT, Callee->getAttributes());
+ getOrInsertLibFunc(M, *TLI, LibFunc_small_fprintf, FT,
+ Callee->getAttributes());
CallInst *New = cast<CallInst>(CI->clone());
New->setCalledFunction(SmallFPrintFFn);
B.Insert(New);
@@ -2944,21 +2971,19 @@ Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
CI->getArgOperand(2)));
}
-bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
- LibFunc Func;
+bool LibCallSimplifier::hasFloatVersion(const Module *M, StringRef FuncName) {
SmallString<20> FloatFuncName = FuncName;
FloatFuncName += 'f';
- if (TLI->getLibFunc(FloatFuncName, Func))
- return TLI->has(Func);
- return false;
+ return isLibFuncEmittable(M, TLI, FloatFuncName);
}
Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
IRBuilderBase &Builder) {
+ Module *M = CI->getModule();
LibFunc Func;
Function *Callee = CI->getCalledFunction();
// Check for string/memory library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) {
// Make sure we never change the calling convention.
assert(
(ignoreCallingConv(Func) ||
@@ -3039,6 +3064,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
LibFunc Func,
IRBuilderBase &Builder) {
+ const Module *M = CI->getModule();
+
// Don't optimize calls that require strict floating point semantics.
if (CI->isStrictFP())
return nullptr;
@@ -3117,12 +3144,12 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
case LibFunc_sin:
case LibFunc_sinh:
case LibFunc_tanh:
- if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeUnaryDoubleFP(CI, Builder, true);
+ if (UnsafeFPShrink && hasFloatVersion(M, CI->getCalledFunction()->getName()))
+ return optimizeUnaryDoubleFP(CI, Builder, TLI, true);
return nullptr;
case LibFunc_copysign:
- if (hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeBinaryDoubleFP(CI, Builder);
+ if (hasFloatVersion(M, CI->getCalledFunction()->getName()))
+ return optimizeBinaryDoubleFP(CI, Builder, TLI);
return nullptr;
case LibFunc_fminf:
case LibFunc_fmin:
@@ -3141,6 +3168,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
}
Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
+ Module *M = CI->getModule();
assert(!CI->isMustTailCall() && "These transforms aren't musttail safe.");
// TODO: Split out the code below that operates on FP calls so that
@@ -3219,7 +3247,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
}
// Then check for known library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) {
// We never change the calling convention.
if (!ignoreCallingConv(Func) && !IsCallingConvC)
return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 5ecee44..d3a944c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -441,6 +441,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
return false;
}
+/// Returns true if A and B have the same pointer operands or the same SCEV addresses.
+static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
+ StoreInst *B) {
+ // Compare store
+ if (A == B)
+ return true;
+
+ // Otherwise, compare pointers
+ Value *APtr = A->getPointerOperand();
+ Value *BPtr = B->getPointerOperand();
+ if (APtr == BPtr)
+ return true;
+
+ // Otherwise compare address SCEVs
+ if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
+ return true;
+
+ return false;
+}
+
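A hypothetical example of why the helper falls back to comparing SCEVs (write_twice is an invented name): the two stores below write the same element through different pointer expressions, so the pointer Values may differ even though the addresses are equal.

    // Hypothetical example; only the SCEVs of the two addresses reveal that
    // both stores hit p[4].
    void write_twice(int *p, int v) {
      *(p + 4) = v;
      p[4] = v + 1;
    }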
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
const ValueToValueMap &Strides =
@@ -678,7 +698,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
RecurrenceDescriptor RedDes;
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
+ DT, PSE.getSE())) {
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
AllowedExit.insert(RedDes.getLoopExitInstr());
Reductions[Phi] = RedDes;
@@ -772,7 +792,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
auto *SE = PSE.getSE();
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
- if (hasVectorIntrinsicScalarOpd(IntrinID, i)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
reportVectorizationFailure("Found unvectorizable intrinsic",
"intrinsic instruction cannot be vectorized",
@@ -913,11 +933,66 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (!LAI->canVectorizeMemory())
return false;
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
+ // We can vectorize stores to an invariant address when the final reduction
+ // value is guaranteed to be stored at the end of the loop. Also, if the
+ // decision to vectorize the loop is made, runtime checks are added to make
+ // sure that the invariant address does not alias any other object.
+ if (!LAI->getStoresToInvariantAddresses().empty()) {
+ // For each invariant address, check that its last store is unconditional.
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI) &&
+ blockNeedsPredication(SI->getParent())) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write of conditional recurring variant value to a loop "
+ "invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ // For each invariant address, check that its last stored value is the
+ // result of one of our reductions.
+ //
+ // We do not check whether dependences with loads exist because they are
+ // currently rejected earlier in LoopAccessInfo::analyzeLoop. If this
+ // behaviour changes, we have to modify this code.
+ ScalarEvolution *SE = PSE.getSE();
+ SmallVector<StoreInst *, 4> UnhandledStores;
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI)) {
+ // Earlier stores to this address are effectively dead code.
+ // With opaque pointers it is possible for one pointer to be used with
+ // different sizes of stored values:
+ // store i32 0, ptr %x
+ // store i8 0, ptr %x
+ // The latest store doesn't completely overwrite the first one in the
+ // example. That is why we have to make sure that the types of the stored
+ // values are the same.
+ // TODO: Check that the bitwidth of the unhandled store is smaller than the
+ // one that overwrites it, and add a test.
+ erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+ return storeToSameAddress(SE, SI, I) &&
+ I->getValueOperand()->getType() ==
+ SI->getValueOperand()->getType();
+ });
+ continue;
+ }
+ UnhandledStores.push_back(SI);
+ }
+
+ bool IsOK = UnhandledStores.empty();
+ // TODO: we should also validate against InvariantMemSets.
+ if (!IsOK) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write to a loop invariant address could not "
+ "be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
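A hypothetical example of the loop shape the relaxed check above is meant to allow (sum_into is an invented name; the exact legality conditions are the ones encoded in the code):

    // Hypothetical example: the running sum is a reduction and is stored
    // unconditionally to the loop-invariant address dst, so a single final
    // store after the loop can replace the in-loop stores. A store guarded by
    // a condition inside the loop would still be rejected.
    void sum_into(int *dst, const int *a, int n) {
      int s = 0;
      for (int i = 0; i < n; ++i) {
        s += a[i];
        *dst = s;
      }
    }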
@@ -944,13 +1019,34 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
// We can now only vectorize if all reductions with Exact FP math also
// have the isOrdered flag set, which indicates that we can move the
- // reduction operations in-loop.
+ // reduction operations in-loop, and do not have an intermediate store.
return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+ return !RdxDesc.hasExactFPMath() ||
+ (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore);
}));
}
+bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RdxDesc.IntermediateStore == SI;
+ });
+}
+
+bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ if (!RdxDesc.IntermediateStore)
+ return false;
+
+ ScalarEvolution *SE = PSE.getSE();
+ Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
+ return V == InvariantAddress ||
+ SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
+ });
+}
+
bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3bedf4b..d59abd2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3998,6 +3998,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// Set the resume value for this reduction
ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ StoreInst *NewSI =
+ Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ propagateMetadata(NewSI, SI);
+
+ // If the reduction value is used in other places,
+ // then let the code below create PHIs for that.
+ }
+
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -4244,13 +4255,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (!UseVectorIntrinsic || !hasVectorIntrinsicScalarOpd(ID, I.index()))
+ if (!UseVectorIntrinsic ||
+ !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
Arg = State.get(I.value(), Part);
- else {
+ else
Arg = State.get(I.value(), VPIteration(0, 0));
- if (hasVectorIntrinsicOverloadedScalarOpd(ID, I.index()))
- TysForDecl.push_back(Arg->getType());
- }
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+ TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
@@ -7340,6 +7351,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ // Find all stores to invariant variables. Since they are going to be sunk
+ // outside the loop, we do not need to calculate their cost.
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ ValuesToIgnore.insert(&I);
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
@@ -8329,6 +8350,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
return nullptr;
auto willWiden = [&](ElementCount VF) -> bool {
+ if (VF.isScalar())
+ return false;
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// The following case may be scalarized depending on the VF.
// The flag shows whether we use Intrinsic or a usual Call for vectorized
@@ -8843,6 +8866,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
continue;
}
+ // Invariant stores inside the loop will be deleted, and a single store
+ // with the final reduction value will be added to the exit block.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ continue;
+
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
VPBasicBlock *NextVPBB =
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6b1bb8..4583308 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -641,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
- if (hasVectorIntrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
@@ -2042,6 +2042,36 @@ public:
DeletedInstructions.insert(I);
}
+ /// Checks if the instruction was already analyzed for being a possible
+ /// reduction root.
+ bool isAnalizedReductionRoot(Instruction *I) const {
+ return AnalizedReductionsRoots.count(I);
+ }
+ /// Register the given instruction as already analyzed for being a possible
+ /// reduction root.
+ void analyzedReductionRoot(Instruction *I) {
+ AnalizedReductionsRoots.insert(I);
+ }
+ /// Checks if the provided list of reduced values was checked already for
+ /// vectorization.
+ bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {
+ return AnalyzedReductionVals.contains(hash_value(VL));
+ }
+ /// Adds the list of reduced values to the list of values already checked
+ /// for vectorization.
+ void analyzedReductionVals(ArrayRef<Value *> VL) {
+ AnalyzedReductionVals.insert(hash_value(VL));
+ }
+ /// Clear the list of the analyzed reduction root instructions.
+ void clearReductionData() {
+ AnalizedReductionsRoots.clear();
+ AnalyzedReductionVals.clear();
+ }
+ /// Checks if the given value is gathered in one of the nodes.
+ bool isGathered(Value *V) const {
+ return MustGather.contains(V);
+ }
+
~BoUpSLP();
private:
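A minimal, self-contained sketch of the memoization idea behind areAnalyzedReductionVals and analyzedReductionVals above (AnalyzedCache and its members are invented names; std containers stand in for LLVM's DenseSet and hashing):

    #include <cstddef>
    #include <functional>
    #include <unordered_set>
    #include <vector>

    // Illustrative cache of already-analyzed candidate lists, keyed by a hash
    // of the list, so repeated vectorization attempts can be skipped early.
    struct AnalyzedCache {
      std::unordered_set<size_t> Seen;
      static size_t hashList(const std::vector<const void *> &VL) {
        size_t H = 0;
        for (const void *P : VL)
          H = H * 31 + std::hash<const void *>{}(P);
        return H;
      }
      bool alreadyAnalyzed(const std::vector<const void *> &VL) const {
        return Seen.count(hashList(VL)) != 0;
      }
      void markAnalyzed(const std::vector<const void *> &VL) {
        Seen.insert(hashList(VL));
      }
    };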
@@ -2603,6 +2633,12 @@ private:
/// previously deleted instruction.
DenseSet<Instruction *> DeletedInstructions;
+ /// Set of instructions already analyzed as possible reduction roots.
+ SmallPtrSet<Instruction *, 16> AnalizedReductionsRoots;
+
+ /// Set of hashes of the lists of reduction values already analyzed.
+ DenseSet<size_t> AnalyzedReductionVals;
+
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
@@ -4041,6 +4077,83 @@ static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
}
#endif
+/// Generates a key/subkey pair for the given value to provide effective
+/// sorting of the values and better detection of vectorizable value
+/// sequences. The keys/subkeys can be used for better sorting of the values
+/// themselves (keys) and within value subgroups (subkeys).
+static std::pair<size_t, size_t> generateKeySubkey(
+ Value *V, const TargetLibraryInfo *TLI,
+ function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
+ bool AllowAlternate) {
+ hash_code Key = hash_value(V->getValueID() + 2);
+ hash_code SubKey = hash_value(0);
+ // Sort the loads by the distance between the pointers.
+ if (auto *LI = dyn_cast<LoadInst>(V)) {
+ Key = hash_combine(hash_value(Instruction::Load), Key);
+ if (LI->isSimple())
+ SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
+ else
+ SubKey = hash_value(LI);
+ } else if (isVectorLikeInstWithConstOps(V)) {
+ // Sort extracts by the vector operands.
+ if (isa<ExtractElementInst, UndefValue>(V))
+ Key = hash_value(Value::UndefValueVal + 1);
+ if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
+ if (!isUndefVector(EI->getVectorOperand()) &&
+ !isa<UndefValue>(EI->getIndexOperand()))
+ SubKey = hash_value(EI->getVectorOperand());
+ }
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ // Sort other instructions just by the opcodes except for CMPInst.
+ // For CMP also sort by the predicate kind.
+ if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+ isValidForAlternation(I->getOpcode())) {
+ if (AllowAlternate)
+ Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
+ else
+ Key = hash_combine(hash_value(I->getOpcode()), Key);
+ SubKey = hash_combine(
+ hash_value(I->getOpcode()), hash_value(I->getType()),
+ hash_value(isa<BinaryOperator>(I)
+ ? I->getType()
+ : cast<CastInst>(I)->getOperand(0)->getType()));
+ } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (CI->isCommutative())
+ Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
+ CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
+ hash_value(SwapPred),
+ hash_value(CI->getOperand(0)->getType()));
+ } else if (auto *Call = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
+ if (isTriviallyVectorizable(ID))
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
+ else if (!VFDatabase(*Call).getMappings(*Call).empty())
+ SubKey = hash_combine(hash_value(I->getOpcode()),
+ hash_value(Call->getCalledFunction()));
+ else
+ SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
+ for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
+ SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
+ hash_value(Op.Tag), SubKey);
+ } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+ if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
+ SubKey = hash_value(Gep->getPointerOperand());
+ else
+ SubKey = hash_value(Gep);
+ } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
+ !isa<ConstantInt>(I->getOperand(1))) {
+ // Do not try to vectorize instructions with potentially high cost.
+ SubKey = hash_value(I);
+ } else {
+ SubKey = hash_value(I->getOpcode());
+ }
+ Key = hash_combine(hash_value(I->getParent()), Key);
+ }
+ return std::make_pair(Key, SubKey);
+}
+
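A hypothetical sketch of the two-level grouping that matchAssociativeReduction later builds from these (key, subkey) pairs (groupByKeySubkey is an invented name; std::map stands in for the MapVector used in the patch):

    #include <cstddef>
    #include <map>
    #include <utility>
    #include <vector>

    // Illustrative grouping: equal keys put values near each other, equal
    // subkeys mark values that are likely to vectorize together.
    template <typename ValueT>
    std::map<size_t, std::map<size_t, std::vector<ValueT>>> groupByKeySubkey(
        const std::vector<std::pair<std::pair<size_t, size_t>, ValueT>> &Items) {
      std::map<size_t, std::map<size_t, std::vector<ValueT>>> Groups;
      for (const auto &Item : Items)
        Groups[Item.first.first][Item.first.second].push_back(Item.second);
      return Groups;
    }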
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -4742,7 +4855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
unsigned NumArgs = CI->arg_size();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
- if (hasVectorIntrinsicScalarOpd(ID, j))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -4761,7 +4874,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
- if (hasVectorIntrinsicScalarOpd(ID, j)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
@@ -4794,7 +4907,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
// For scalar operands no need to create an entry since no need to
// vectorize it.
- if (hasVectorIntrinsicScalarOpd(ID, i))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
continue;
ValueList Operands;
// Prepare the operand vector.
@@ -6238,10 +6351,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
- while (isa<InsertElementInst>(Base)) {
+ while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
// Build the mask for the vectorized insertelement instructions.
- if (const TreeEntry *E = getTreeEntry(Base)) {
- VU = cast<InsertElementInst>(Base);
+ if (const TreeEntry *E = getTreeEntry(IEBase)) {
+ VU = IEBase;
do {
int Idx = E->findLaneForValue(Base);
ShuffleMask.back()[Idx] = Idx;
@@ -6257,8 +6370,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
} else {
VecId = std::distance(FirstUsers.begin(), It);
}
- ShuffleMask[VecId][*InsertIdx] = EU.Lane;
- DemandedElts[VecId].setBit(*InsertIdx);
+ int InIdx = *InsertIdx;
+ ShuffleMask[VecId][InIdx] = EU.Lane;
+ DemandedElts[VecId].setBit(InIdx);
continue;
}
}
@@ -6459,6 +6573,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
}
}
+ if (UsedTEs.empty()) {
+ assert(all_of(TE->Scalars, UndefValue::classof) &&
+ "Expected vector of undefs only.");
+ return None;
+ }
+
unsigned VF = 0;
if (UsedTEs.size() == 1) {
// Try to find the perfect match in another gather node at first.
@@ -6612,11 +6732,15 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// should not be scheduled.
if (E->State != TreeEntry::NeedToGather &&
doesNotNeedToSchedule(E->Scalars)) {
- BasicBlock::iterator InsertPt;
+ Instruction *InsertInst;
if (all_of(E->Scalars, isUsedOutsideBlock))
- InsertPt = FindLastInst()->getIterator();
+ InsertInst = FindLastInst();
else
- InsertPt = FindFirstInst()->getIterator();
+ InsertInst = FindFirstInst();
+ // If the instruction is a PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(InsertInst))
+ InsertInst = BB->getFirstNonPHI();
+ BasicBlock::iterator InsertPt = InsertInst->getIterator();
Builder.SetInsertPoint(BB, InsertPt);
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
return;
@@ -6658,13 +6782,17 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
- if (!LastInst)
+ if (!LastInst) {
LastInst = FindLastInst();
+ // If the instruction is a PHI, set the insert point after all the PHIs.
+ if (isa<PHINode>(LastInst))
+ LastInst = BB->getFirstNonPHI()->getPrevNode();
+ }
assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetInsertPoint(BB, std::next(LastInst->getIterator()));
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
@@ -7358,11 +7486,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && hasVectorIntrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
- if (hasVectorIntrinsicOverloadedScalarOpd(IID, j))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
@@ -7370,6 +7498,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
+ if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))
+ TysForDecl.push_back(OpVec->getType());
}
Function *CF;
@@ -8804,6 +8934,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
+ // Start new block - clear the list of reduction roots.
+ R.clearReductionData();
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
@@ -9273,15 +9405,16 @@ class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- SmallVector<Value *, 32> ReducedVals;
+ /// List of possibly reduced values.
+ SmallVector<SmallVector<Value *>> ReducedVals;
+ /// Maps reduced value to the corresponding reduction operation.
+ DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
- const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
-
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -9325,26 +9458,6 @@ class HorizontalReduction {
return I->getOperand(Index);
}
- /// Checks if the ParentStackElem.first should be marked as a reduction
- /// operation with an extra argument or as extra argument itself.
- void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
- Value *ExtraArg) {
- if (ExtraArgs.count(ParentStackElem.first)) {
- ExtraArgs[ParentStackElem.first] = nullptr;
- // We ran into something like:
- // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
- // The whole ParentStackElem.first should be considered as an extra value
- // in this case.
- // Do not perform analysis of remaining operands of ParentStackElem.first
- // instruction, this whole instruction is an extra argument.
- ParentStackElem.second = INVALID_OPERAND_INDEX;
- } else {
- // We ran into something like:
- // ParentStackElem.first += ... + ExtraArg + ...
- ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
- }
-
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
@@ -9429,7 +9542,7 @@ class HorizontalReduction {
/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
- Value *RHS, const Twine &Name, Instruction *I) {
+ Value *RHS, const Twine &Name, Value *I) {
auto *SelI = dyn_cast<SelectInst>(I);
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
@@ -9440,8 +9553,10 @@ class HorizontalReduction {
return Op;
}
- static RecurKind getRdxKind(Instruction *I) {
- assert(I && "Expected instruction for reduction matching");
+ static RecurKind getRdxKind(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return RecurKind::None;
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -9603,7 +9718,9 @@ public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
+ ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
assert((!Phi || is_contained(Phi->operands(), Inst)) &&
"Phi needs to use the binary operator");
assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
@@ -9647,124 +9764,168 @@ public:
ReductionRoot = Inst;
- // The opcode for leaf values that we perform a reduction on.
- // For example: load(x) + load(y) + load(z) + fptoui(w)
- // The leaf opcode for 'w' does not match, so we don't include it as a
- // potential candidate for the reduction.
- unsigned LeafOpcode = 0;
-
- // Post-order traverse the reduction tree starting at Inst. We only handle
- // true trees containing binary operators or selects.
- SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
- initReductionOps(Inst);
- while (!Stack.empty()) {
- Instruction *TreeN = Stack.back().first;
- unsigned EdgeToVisit = Stack.back().second++;
- const RecurKind TreeRdxKind = getRdxKind(TreeN);
- bool IsReducedValue = TreeRdxKind != RdxKind;
-
- // Postorder visit.
- if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
- if (IsReducedValue)
- ReducedVals.push_back(TreeN);
- else {
- auto ExtraArgsIter = ExtraArgs.find(TreeN);
- if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
- // Check if TreeN is an extra argument of its parent operation.
- if (Stack.size() <= 1) {
- // TreeN can't be an extra argument as it is a root reduction
- // operation.
- return false;
- }
- // Yes, TreeN is an extra argument, do not add it to a list of
- // reduction operations.
- // Stack[Stack.size() - 2] always points to the parent operation.
- markExtraArg(Stack[Stack.size() - 2], TreeN);
- ExtraArgs.erase(TreeN);
- } else
- addReductionOps(TreeN);
+ // Iterate through all the operands of the possible reduction tree and
+ // gather all the reduced values, sorting them by their value id.
+ BasicBlock *BB = Inst->getParent();
+ bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
+ SmallVector<Instruction *> Worklist(1, Inst);
+ // Checks if the operands of the \p TreeN instruction are also reduction
+ // operations or should be treated as reduced values or an extra argument,
+ // which is not part of the reduction.
+ auto &&CheckOperands = [this, IsCmpSelMinMax,
+ BB](Instruction *TreeN,
+ SmallVectorImpl<Value *> &ExtraArgs,
+ SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps) {
+ for (int I = getFirstOperandIndex(TreeN),
+ End = getNumberOfOperands(TreeN);
+ I < End; ++I) {
+ Value *EdgeVal = getRdxOperand(TreeN, I);
+ ReducedValsToOps[EdgeVal].push_back(TreeN);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ // Edge has wrong parent - mark as an extra argument.
+ if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
+ !hasSameParent(EdgeInst, BB)) {
+ ExtraArgs.push_back(EdgeVal);
+ continue;
}
- // Retract.
- Stack.pop_back();
- continue;
- }
-
- // Visit operands.
- Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- if (!EdgeInst) {
- // Edge value is not a reduction instruction or a leaf instruction.
- // (It may be a constant, function argument, or something else.)
- markExtraArg(Stack.back(), EdgeVal);
- continue;
+ // If the edge is not an instruction, or it differs from the main reduction
+ // opcode, or it has too many uses, treat it as a possible reduced value.
+ if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+ !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+ !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
+ PossibleReducedVals.push_back(EdgeVal);
+ continue;
+ }
+ ReductionOps.push_back(EdgeInst);
}
- RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
- // Continue analysis if the next operand is a reduction operation or
- // (possibly) a leaf value. If the leaf value opcode is not set,
- // the first met operation != reduction operation is considered as the
- // leaf opcode.
- // Only handle trees in the current basic block.
- // Each tree node needs to have minimal number of users except for the
- // ultimate reduction.
- const bool IsRdxInst = EdgeRdxKind == RdxKind;
- if (EdgeInst != Phi && EdgeInst != Inst &&
- hasSameParent(EdgeInst, Inst->getParent()) &&
- hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
- (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
- if (IsRdxInst) {
- // We need to be able to reassociate the reduction operations.
- if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
- continue;
- }
- } else if (!LeafOpcode) {
- LeafOpcode = EdgeInst->getOpcode();
+ };
+ // Try to regroup the reduced values so that it gets more profitable to try
+ // to reduce them. Values are grouped by their value ids, instructions by
+ // their opcode and/or alternate opcode, plus extra analysis is done for
+ // loads (grouping them by the distance between pointers) and cmp
+ // instructions (grouping them by the predicate).
+ MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
+ PossibleReducedVals;
+ initReductionOps(Inst);
+ while (!Worklist.empty()) {
+ Instruction *TreeN = Worklist.pop_back_val();
+ SmallVector<Value *> Args;
+ SmallVector<Value *> PossibleRedVals;
+ SmallVector<Instruction *> PossibleReductionOps;
+ CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
+ // If too many extra args - mark the instruction itself as a reduction
+ // value, not a reduction operation.
+ if (Args.size() < 2) {
+ addReductionOps(TreeN);
+ // Add extra args.
+ if (!Args.empty()) {
+ assert(Args.size() == 1 && "Expected only single argument.");
+ ExtraArgs[TreeN] = Args.front();
}
- Stack.push_back(
- std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
- continue;
+ // Add reduction values. The values are sorted for better vectorization
+ // results.
+ for (Value *V : PossibleRedVals) {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ V, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ for (const auto &LoadData : PossibleReducedVals[Key]) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(),
+ DL, SE, /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(V, 0))
+ .first->second;
+ }
+ Worklist.append(PossibleReductionOps.rbegin(),
+ PossibleReductionOps.rend());
+ } else {
+ size_t Key, Idx;
+ std::tie(Key, Idx) = generateKeySubkey(
+ TreeN, &TLI,
+ [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
+ for (const auto &LoadData : PossibleReducedVals[Key]) {
+ auto *RLI = cast<LoadInst>(LoadData.second.front().first);
+ if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+ LI->getType(), LI->getPointerOperand(), DL,
+ SE, /*StrictCheck=*/true))
+ return hash_value(RLI->getPointerOperand());
+ }
+ return hash_value(LI->getPointerOperand());
+ },
+ /*AllowAlternate=*/false);
+ ++PossibleReducedVals[Key][Idx]
+ .insert(std::make_pair(TreeN, 0))
+ .first->second;
}
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), EdgeInst);
}
+ auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
+ // Sort values by the total number of value kinds to start the reduction
+ // from the longest possible reduced value sequences.
+ for (auto &PossibleReducedVals : PossibleReducedValsVect) {
+ auto PossibleRedVals = PossibleReducedVals.second.takeVector();
+ SmallVector<SmallVector<Value *>> PossibleRedValsVect;
+ for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
+ It != E; ++It) {
+ PossibleRedValsVect.emplace_back();
+ auto RedValsVect = It->second.takeVector();
+ stable_sort(RedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.second < P2.second;
+ });
+ for (const std::pair<Value *, unsigned> &Data : RedValsVect)
+ PossibleRedValsVect.back().append(Data.second, Data.first);
+ }
+ stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.size() > P2.size();
+ });
+ ReducedVals.emplace_back();
+ for (ArrayRef<Value *> Data : PossibleRedValsVect)
+ ReducedVals.back().append(Data.rbegin(), Data.rend());
+ }
+ // Sort the reduced values by number of same/alternate opcode and/or pointer
+ // operand.
+ stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+ return P1.size() > P2.size();
+ });
return true;
}
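A hypothetical example of the kind of flat reduction the regrouping above is aimed at (reduce is an invented name; the exact grouping depends on the keys computed by generateKeySubkey):

    // Hypothetical reduction tree: the four loads from a are likely to be
    // grouped together (same opcode, related pointers), while x * y is a
    // different value kind, so the load group is tried first as the widest
    // vectorizable window.
    int reduce(const int *a, int x, int y) {
      return a[0] + a[1] + a[2] + a[3] + x * y;
    }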
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ constexpr int ReductionLimit = 4;
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
+ unsigned NumReducedVals = std::accumulate(
+ ReducedVals.begin(), ReducedVals.end(), 0,
+ [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
+ if (NumReducedVals < ReductionLimit)
return nullptr;
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (ReductionOpsType &RdxOp : ReductionOps) {
- for (Value *RdxVal : RdxOp) {
- if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
- RdxFMF &= FPMO->getFastMathFlags();
- }
- }
-
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
- Builder.setFastMathFlags(RdxFMF);
+ // Track the reduced values in case they are replaced by extractelement
+ // instructions because of the vectorization.
+ DenseMap<Value *, WeakTrackingVH> TrackedVals;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ TrackedVals.try_emplace(Pair.second, Pair.second);
}
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
- auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
@@ -9776,141 +9937,289 @@ public:
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *, 16> IgnoreList;
- for (ReductionOpsType &RdxOp : ReductionOps)
- IgnoreList.append(RdxOp.begin(), RdxOp.end());
-
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
- if (NumReducedVals > ReduxWidth) {
- // In the loop below, we are building a tree based on a window of
- // 'ReduxWidth' values.
- // If the operands of those values have common traits (compare predicate,
- // constant operand, etc), then we want to group those together to
- // minimize the cost of the reduction.
-
- // TODO: This should be extended to count common operands for
- // compares and binops.
-
- // Step 1: Count the number of times each compare predicate occurs.
- SmallDenseMap<unsigned, unsigned> PredCountMap;
- for (Value *RdxVal : ReducedVals) {
- CmpInst::Predicate Pred;
- if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
- ++PredCountMap[Pred];
- }
- // Step 2: Sort the values so the most common predicates come first.
- stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
- CmpInst::Predicate PredA, PredB;
- if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
- match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
- return PredCountMap[PredA] > PredCountMap[PredB];
- }
- return false;
- });
- }
+ SmallVector<Value *> IgnoreList;
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps) {
+ if (!RdxOp)
+ continue;
+ IgnoreList.push_back(RdxOp);
+ }
+ bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
+
+ // Need to track reduced vals, they may be changed during vectorization of
+ // subvectors.
+ for (ArrayRef<Value *> Candidates : ReducedVals)
+ for (Value *V : Candidates)
+ TrackedVals.try_emplace(V, V);
+ DenseMap<Value *, unsigned> VectorizedVals;
Value *VectorizedTree = nullptr;
- unsigned i = 0;
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, IgnoreList);
- if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
- break;
- if (V.isLoadCombineReductionCandidate(RdxKind))
- break;
- V.reorderTopToBottom();
- V.reorderBottomToTop(/*IgnoreReorder=*/true);
- V.buildExternalUses(ExternallyUsedValues);
-
- // For a poison-safe boolean logic reduction, do not replace select
- // instructions with logic ops. All reduced values will be frozen (see
- // below) to prevent leaking poison.
- if (isa<SelectInst>(ReductionRoot) &&
- isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
- NumReducedVals != ReduxWidth)
- break;
+ bool CheckForReusedReductionOps = false;
+ // Try to vectorize elements based on their type.
+ for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+ ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
+ InstructionsState S = getSameOpcode(OrigReducedVals);
+ SmallVector<Value *> Candidates;
+ DenseMap<Value *, Value *> TrackedToOrig;
+ for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
+ Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+ // Check that the reduction value was not overridden by an extractelement
+ // instruction because of the vectorization, and exclude it if it is not
+ // compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (isVectorLikeInstWithConstOps(Inst) &&
+ (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
+ continue;
+ Candidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
+ }
+ bool ShuffledExtracts = false;
+ // Try to handle shuffled extractelements.
+ if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
+ I + 1 < E) {
+ InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
+ if (NextS.getOpcode() == Instruction::ExtractElement &&
+ !NextS.isAltShuffle()) {
+ SmallVector<Value *> CommonCandidates(Candidates);
+ for (Value *RV : ReducedVals[I + 1]) {
+ Value *RdxVal = TrackedVals.find(RV)->second;
+ // Check that the reduction value was not overridden by an
+ // extractelement instruction because of the vectorization, and
+ // exclude it if it is not compatible with the other values.
+ if (auto *Inst = dyn_cast<Instruction>(RdxVal))
+ if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
+ continue;
+ CommonCandidates.push_back(RdxVal);
+ TrackedToOrig.try_emplace(RdxVal, RV);
+ }
+ SmallVector<int> Mask;
+ if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+ ++I;
+ Candidates.swap(CommonCandidates);
+ ShuffledExtracts = true;
+ }
+ }
+ }
+ unsigned NumReducedVals = Candidates.size();
+ if (NumReducedVals < ReductionLimit)
+ continue;
- V.computeMinimumValueSizes();
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+ unsigned Start = 0;
+ unsigned Pos = Start;
+ // Restarts the vectorization attempt with a lower vector factor.
+ unsigned PrevReduxWidth = ReduxWidth;
+ bool CheckForReusedReductionOpsLocal = false;
+ auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+ &CheckForReusedReductionOpsLocal,
+ &PrevReduxWidth, &V,
+ &IgnoreList](bool IgnoreVL = false) {
+ bool IsAnyRedOpGathered =
+ !IgnoreVL && any_of(IgnoreList, [&V](Value *RedOp) {
+ return V.isGathered(RedOp);
+ });
+ if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
+ // Check if any of the reduction ops are gathered. If so, it is worth
+ // trying again with a smaller number of reduction ops.
+ CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
+ }
+ ++Pos;
+ if (Pos < NumReducedVals - ReduxWidth + 1)
+ return IsAnyRedOpGathered;
+ Pos = Start;
+ ReduxWidth /= 2;
+ return IsAnyRedOpGathered;
+ };
+ while (Pos < NumReducedVals - ReduxWidth + 1 &&
+ ReduxWidth >= ReductionLimit) {
+ // Dependency in tree of the reduction ops - drop this attempt, try
+ // later.
+ if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
+ Start == 0) {
+ CheckForReusedReductionOps = true;
+ break;
+ }
+ PrevReduxWidth = ReduxWidth;
+ ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
+ // Being analyzed already - skip.
+ if (V.areAnalyzedReductionVals(VL)) {
+ (void)AdjustReducedVals(/*IgnoreVL=*/true);
+ continue;
+ }
+ // Early exit if any of the reduction values were deleted during
+ // previous vectorization attempts.
+ if (any_of(VL, [&V](Value *RedVal) {
+ auto *RedValI = dyn_cast<Instruction>(RedVal);
+ if (!RedValI)
+ return false;
+ return V.isDeleted(RedValI);
+ }))
+ break;
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ if (V.isLoadCombineReductionCandidate(RdxKind)) {
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
+ V.reorderTopToBottom();
+ // No need to reorder the root node at all.
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ // Keep the other extracted reduction values if they are used in the
+ // vectorization trees.
+ BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
+ ExternallyUsedValues);
+ for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
+ if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
+ continue;
+ for_each(ReducedVals[Cnt],
+ [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
+ if (isa<Instruction>(V))
+ LocalExternallyUsedValues[TrackedVals[V]];
+ });
+ }
+ for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
+ if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+ continue;
+ if (VectorizedVals.count(Candidates[Cnt]))
+ continue;
+ LocalExternallyUsedValues[Candidates[Cnt]];
+ }
+ V.buildExternalUses(LocalExternallyUsedValues);
+
+ V.computeMinimumValueSizes();
+
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
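+ // Start from all flags set; the loop below clears every flag that is
+ // not common to all floating-point reduction operations.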
+ for (Value *U : IgnoreList)
+ if (auto *FPMO = dyn_cast<FPMathOperator>(U))
+ RdxFMF &= FPMO->getFastMathFlags();
+ // Estimate cost.
+ InstructionCost TreeCost = V.getTreeCost(VL);
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return nullptr;
+ }
+ if (Cost >= -SLPCostThreshold) {
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(
+ SV_NAME, "HorSLPNotBeneficial",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ if (!AdjustReducedVals())
+ V.analyzedReductionVals(VL);
+ continue;
+ }
- // Estimate cost.
- InstructionCost TreeCost =
- V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
- InstructionCost ReductionCost =
- getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
- InstructionCost Cost = TreeCost + ReductionCost;
- if (!Cost.isValid()) {
- LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return nullptr;
- }
- if (Cost >= -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
- cast<Instruction>(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost " << ore::NV("Cost", Cost)
- << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
+ return OptimizationRemark(
+ SV_NAME, "VectorizedHorizontalReduction",
+ ReducedValsToOps.find(VL[0])->second.front())
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
});
- break;
- }
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
- V.getORE()->emit([&]() {
- return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
- cast<Instruction>(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
- });
+ Builder.setFastMathFlags(RdxFMF);
- // Vectorize a tree.
- DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+ // Vectorize a tree.
+ Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
- // Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (isCmpSelMinMax(RdxRootInst))
- Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ if (IsCmpSelMinMax)
+ Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
- // To prevent poison from leaking across what used to be sequential, safe,
- // scalar boolean logic operations, the reduction operand must be frozen.
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
- VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // To prevent poison from leaking across what used to be sequential,
+ // safe, scalar boolean logic operations, the reduction operand must be
+ // frozen.
+ if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
+ VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
+ }
+ // Count vectorized reduced values to exclude them from the final reduction.
+ for (Value *V : VL)
+ ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
+ .first->getSecond();
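+ // Advance the window past the just-vectorized values and recompute the
+ // widest power-of-two width that fits the remaining candidates.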
+ Pos += ReduxWidth;
+ Start = Pos;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
}
- i += ReduxWidth;
- ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
-
if (VectorizedTree) {
// Finish the reduction.
- for (; i < NumReducedVals; ++i) {
- auto *I = cast<Instruction>(ReducedVals[i]);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- VectorizedTree =
- createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
+ // Need to add the extra arguments and the possible reduction values
+ // that were not vectorized.
+ SmallPtrSet<Value *, 8> Visited;
+ for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
+ ArrayRef<Value *> Candidates = ReducedVals[I];
+ for (Value *RdxVal : Candidates) {
+ if (!Visited.insert(RdxVal).second)
+ continue;
+ Value *StableRdxVal = RdxVal;
+ auto TVIt = TrackedVals.find(RdxVal);
+ if (TVIt != TrackedVals.end())
+ StableRdxVal = TVIt->second;
+ unsigned NumOps = 0;
+ auto It = VectorizedVals.find(RdxVal);
+ if (It != VectorizedVals.end())
+ NumOps = It->second;
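+ // Fold this value back into the final reduction once for each of its
+ // original reduction operations not covered by a vectorized tree
+ // (drop_back skips the NumOps vectorized ones).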
+ for (Instruction *RedOp :
+ makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
+ .drop_back(NumOps)) {
+ Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+ ReductionOpsListType Ops;
+ if (auto *Sel = dyn_cast<SelectInst>(RedOp))
+ Ops.emplace_back().push_back(Sel->getCondition());
+ Ops.emplace_back().push_back(RedOp);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ StableRdxVal, "op.rdx", Ops);
+ }
+ }
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ ReductionOpsListType Ops;
+ if (auto *Sel = dyn_cast<SelectInst>(I))
+ Ops.emplace_back().push_back(Sel->getCondition());
+ Ops.emplace_back().push_back(I);
+ Value *StableRdxVal = Pair.first;
+ auto TVIt = TrackedVals.find(Pair.first);
+ if (TVIt != TrackedVals.end())
+ StableRdxVal = TVIt->second;
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- Pair.first, "op.extra", I);
+ StableRdxVal, "op.rdx", Ops);
}
}
@@ -9922,20 +10231,30 @@ public:
// deletion.
#ifndef NDEBUG
SmallSet<Value *, 4> IgnoreSet;
- IgnoreSet.insert(IgnoreList.begin(), IgnoreList.end());
+ for (ArrayRef<Value *> RdxOps : ReductionOps)
+ IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
- for (auto *Ignore : IgnoreList) {
+ for (ArrayRef<Value *> RdxOps : ReductionOps) {
+ for (Value *Ignore : RdxOps) {
+ if (!Ignore)
+ continue;
#ifndef NDEBUG
- for (auto *U : Ignore->users()) {
- assert(IgnoreSet.count(U));
- }
+ for (auto *U : Ignore->users()) {
+ assert(IgnoreSet.count(U) &&
+ "All users must be either in the reduction ops list.");
+ }
#endif
- if (!Ignore->use_empty()) {
- Value *Undef = UndefValue::get(Ignore->getType());
- Ignore->replaceAllUsesWith(Undef);
+ if (!Ignore->use_empty()) {
+ Value *Undef = UndefValue::get(Ignore->getType());
+ Ignore->replaceAllUsesWith(Undef);
+ }
+ V.eraseInstruction(cast<Instruction>(Ignore));
}
- V.eraseInstruction(cast<Instruction>(Ignore));
}
+ } else if (!CheckForReusedReductionOps) {
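+ // Nothing was vectorized and there is no reason to retry later, so mark
+ // the reduction roots as analyzed to avoid re-examining this reduction.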
+ for (ReductionOpsType &RdxOps : ReductionOps)
+ for (Value *RdxOp : RdxOps)
+ V.analyzedReductionRoot(cast<Instruction>(RdxOp));
}
return VectorizedTree;
}
@@ -10201,7 +10520,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
+ TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
@@ -10220,7 +10540,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
- // Skip the analysis of CmpInsts.Compiler implements postanalysis of the
+ // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
// CmpInsts so we can skip extra attempts in
// tryToVectorizeHorReductionOrInstOperands and save compile time.
std::queue<std::pair<Instruction *, unsigned>> Stack;
@@ -10228,13 +10548,16 @@ static bool tryToVectorizeHorReductionOrInstOperands(
SmallPtrSet<Value *, 8> VisitedInstrs;
SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
- auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
+ Value *&B0,
+ Value *&B1) -> Value * {
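+ // Bail out early on reduction roots that were already analyzed.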
+ if (R.isAnalizedReductionRoot(Inst))
+ return nullptr;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst))
+ if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
return HorRdx.tryToReduce(R, TTI);
}
return nullptr;
@@ -10279,7 +10602,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Do not try to vectorize CmpInst operands, this is done separately.
// Final attempt for binop args vectorization should happen after the loop
// to try to find reductions.
- if (!isa<CmpInst>(Inst))
+ if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))
PostponedInsts.push_back(Inst);
}
@@ -10292,8 +10615,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
if (auto *I = dyn_cast<Instruction>(Op))
// Do not try to vectorize CmpInst operands, this is done
// separately.
- if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
- I->getParent() == BB)
+ if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
+ !R.isDeleted(I) && I->getParent() == BB)
Stack.emplace(I, Level);
}
// Try to vectorized binops where reductions were not found.
@@ -10317,8 +10640,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
- ExtraVectorization);
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
+ *TLI, ExtraVectorization);
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
@@ -10486,12 +10809,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (isa<CmpInst>(I))
+ } else if (isa<CmpInst>(I)) {
PostponedCmps.push_back(I);
+ continue;
+ }
+ // Try to find reductions in buildvector sequences.
+ OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI);
}
if (AtTerminator) {
// Try to find reductions first.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 02550dad..21bd231 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1420,6 +1420,9 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
+ if (RdxDesc->IntermediateStore)
+ O << " (with final reduction value stored in invariant address sank "
+ "outside of loop)";
}
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 05fc8c6..f26babe 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -257,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
ExtractElementInst *VectorCombine::getShuffleExtract(
ExtractElementInst *Ext0, ExtractElementInst *Ext1,
unsigned PreferredExtractIndex = InvalidIndex) const {
- assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
- isa<ConstantInt>(Ext1->getIndexOperand()) &&
- "Expected constant extract indexes");
+ auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
+ auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
+ assert(Index0C && Index1C && "Expected constant extract indexes");
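+ // The cached constants are reused below instead of repeating the casts.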
- unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
- unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+ unsigned Index0 = Index0C->getZExtValue();
+ unsigned Index1 = Index1C->getZExtValue();
// If the extract indexes are identical, no shuffle is needed.
if (Index0 == Index1)
@@ -308,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex) {
- assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
- isa<ConstantInt>(Ext1->getOperand(1)) &&
- "Expected constant extract indexes");
+ auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
+ auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
+ assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
+
unsigned Opcode = I.getOpcode();
Type *ScalarTy = Ext0->getType();
auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
@@ -333,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
- unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
- unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+ unsigned Ext0Index = Ext0IndexC->getZExtValue();
+ unsigned Ext1Index = Ext1IndexC->getZExtValue();
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);