Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Analysis/DependenceAnalysis.cpp | 376
-rw-r--r-- llvm/lib/Analysis/InstructionSimplify.cpp | 19
-rw-r--r-- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 6
-rw-r--r-- llvm/lib/Analysis/StackSafetyAnalysis.cpp | 4
-rw-r--r-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 4
-rw-r--r-- llvm/lib/CAS/OnDiskGraphDB.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 60
-rw-r--r-- llvm/lib/CodeGen/IfConversion.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/RegAllocFast.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3
-rw-r--r-- llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 49
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 2
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 2
-rw-r--r-- llvm/lib/IR/ModuleSummaryIndex.cpp | 12
-rw-r--r-- llvm/lib/LTO/LTO.cpp | 4
-rw-r--r-- llvm/lib/LTO/LTOBackend.cpp | 4
-rw-r--r-- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 15
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp | 73
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 51
-rw-r--r-- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 2
-rw-r--r-- llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp | 2
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 71
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 30
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 13
-rw-r--r-- llvm/lib/Transforms/IPO/FunctionImport.cpp | 12
-rw-r--r-- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 4
-rw-r--r-- llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 6
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 54
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 117
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 12
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 132
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 22
39 files changed, 898 insertions, 318 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 805b682..853bd66 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -122,11 +122,61 @@ static cl::opt<unsigned> MIVMaxLevelThreshold(
cl::desc("Maximum depth allowed for the recursive algorithm used to "
"explore MIV direction vectors."));
-static cl::opt<bool> RunSIVRoutinesOnly(
- "da-run-siv-routines-only", cl::init(false), cl::ReallyHidden,
- cl::desc("Run only SIV routines and disable others (ZIV, RDIV, and MIV). "
- "The purpose is mainly to exclude the influence of those routines "
- "in regression tests for SIV routines."));
+namespace {
+
+/// Types of dependence test routines.
+enum class DependenceTestType {
+ All,
+ StrongSIV,
+ WeakCrossingSIV,
+ ExactSIV,
+ WeakZeroSIV,
+ ExactRDIV,
+ SymbolicRDIV,
+ GCDMIV,
+ BanerjeeMIV,
+};
+
+} // anonymous namespace
+
+static cl::opt<DependenceTestType> EnableDependenceTest(
+ "da-enable-dependence-test", cl::init(DependenceTestType::All),
+ cl::ReallyHidden,
+ cl::desc("Run only specified dependence test routine and disable others. "
+ "The purpose is mainly to exclude the influence of other "
+ "dependence test routines in regression tests. If set to All, all "
+ "dependence test routines are enabled."),
+ cl::values(clEnumValN(DependenceTestType::All, "all",
+ "Enable all dependence test routines."),
+ clEnumValN(DependenceTestType::StrongSIV, "strong-siv",
+ "Enable only Strong SIV test."),
+ clEnumValN(DependenceTestType::WeakCrossingSIV,
+ "weak-crossing-siv",
+ "Enable only Weak-Crossing SIV test."),
+ clEnumValN(DependenceTestType::ExactSIV, "exact-siv",
+ "Enable only Exact SIV test."),
+ clEnumValN(DependenceTestType::WeakZeroSIV, "weak-zero-siv",
+ "Enable only Weak-Zero SIV test."),
+ clEnumValN(DependenceTestType::ExactRDIV, "exact-rdiv",
+ "Enable only Exact RDIV test."),
+ clEnumValN(DependenceTestType::SymbolicRDIV, "symbolic-rdiv",
+ "Enable only Symbolic RDIV test."),
+ clEnumValN(DependenceTestType::GCDMIV, "gcd-miv",
+ "Enable only GCD MIV test."),
+ clEnumValN(DependenceTestType::BanerjeeMIV, "banerjee-miv",
+ "Enable only Banerjee MIV test.")));
+
+// TODO: This flag is disabled by default because it is still under development.
+// Enable it or delete this flag when the feature is ready.
+static cl::opt<bool> EnableMonotonicityCheck(
+ "da-enable-monotonicity-check", cl::init(false), cl::Hidden,
+ cl::desc("Check if the subscripts are monotonic. If it's not, dependence "
+ "is reported as unknown."));
+
+static cl::opt<bool> DumpMonotonicityReport(
+ "da-dump-monotonicity-report", cl::init(false), cl::Hidden,
+ cl::desc(
+ "When printing analysis, dump the results of monotonicity checks."));
//===----------------------------------------------------------------------===//
// basics
@@ -177,13 +227,196 @@ void DependenceAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequiredTransitive<LoopInfoWrapperPass>();
}
+namespace {
+
+/// The property of monotonicity of a SCEV. To define the monotonicity, assume
+/// a SCEV defined within N-nested loops. Let i_k denote the iteration number
+/// of the k-th loop. Then we can regard the SCEV as an N-ary function:
+///
+/// F(i_1, i_2, ..., i_N)
+///
+/// The domain of i_k is the closed range [0, BTC_k], where BTC_k is the
+/// backedge-taken count of the k-th loop.
+///
+/// A function F is said to be "monotonically increasing with respect to the
+/// k-th loop" if x <= y implies the following condition:
+///
+/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) <=
+/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N)
+///
+/// where i_1, ..., i_{k-1}, i_{k+1}, ..., i_N, x, and y are elements of their
+/// respective domains.
+///
+/// Likewise F is "monotonically decreasing with respect to the k-th loop"
+/// if x <= y implies
+///
+/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) >=
+/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N)
+///
+/// A function F that is monotonically increasing or decreasing with respect to
+/// the k-th loop is simply called "monotonic with respect to the k-th loop".
+///
+/// A function F is said to be "multivariate monotonic" when it is monotonic
+/// with respect to all of the N loops.
+///
+/// Since integer comparison can be either signed or unsigned, we need to
+/// distinguish monotonicity in the signed sense from that in the unsigned
+/// sense. Note that the inequality "x <= y" merely indicates loop progression
+/// and is not affected by the difference between signed and unsigned order.
+///
+/// Currently we only consider monotonicity in a signed sense.
+enum class SCEVMonotonicityType {
+ /// We don't know anything about the monotonicity of the SCEV.
+ Unknown,
+
+ /// The SCEV is loop-invariant with respect to the outermost loop. In other
+ /// words, the function F corresponding to the SCEV is a constant function.
+ Invariant,
+
+ /// The function F corresponding to the SCEV is multivariate monotonic in a
+ /// signed sense. Note that a multivariate monotonic function may also be a
+ /// constant function. The order employed in the definition of monotonicity
+ /// is not a strict order.
+ MultivariateSignedMonotonic,
+};
+
+struct SCEVMonotonicity {
+ SCEVMonotonicity(SCEVMonotonicityType Type,
+ const SCEV *FailurePoint = nullptr);
+
+ SCEVMonotonicityType getType() const { return Type; }
+
+ const SCEV *getFailurePoint() const { return FailurePoint; }
+
+ bool isUnknown() const { return Type == SCEVMonotonicityType::Unknown; }
+
+ void print(raw_ostream &OS, unsigned Depth) const;
+
+private:
+ SCEVMonotonicityType Type;
+
+ /// The subexpression that caused Unknown. Mainly for debugging purposes.
+ const SCEV *FailurePoint;
+};
+
+/// Check the monotonicity of a SCEV. Since dependence tests (SIV, MIV, etc.)
+/// assume that subscript expressions are (multivariate) monotonic, we need to
+/// verify this property before applying those tests. Violating this assumption
+/// may cause them to produce incorrect results.
+struct SCEVMonotonicityChecker
+ : public SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity> {
+
+ SCEVMonotonicityChecker(ScalarEvolution *SE) : SE(SE) {}
+
+ /// Check the monotonicity of \p Expr. \p Expr must be integer type. If \p
+ /// OutermostLoop is not null, \p Expr must be defined in \p OutermostLoop or
+ /// one of its nested loops.
+ SCEVMonotonicity checkMonotonicity(const SCEV *Expr,
+ const Loop *OutermostLoop);
+
+private:
+ ScalarEvolution *SE;
+
+ /// The outermost loop that DA is analyzing.
+ const Loop *OutermostLoop;
+
+ /// A helper to classify \p Expr as either Invariant or Unknown.
+ SCEVMonotonicity invariantOrUnknown(const SCEV *Expr);
+
+ /// Return true if \p Expr is loop-invariant with respect to the outermost
+ /// loop.
+ bool isLoopInvariant(const SCEV *Expr) const;
+
+ /// A helper to create an Unknown SCEVMonotonicity.
+ SCEVMonotonicity createUnknown(const SCEV *FailurePoint) {
+ return SCEVMonotonicity(SCEVMonotonicityType::Unknown, FailurePoint);
+ }
+
+ SCEVMonotonicity visitAddRecExpr(const SCEVAddRecExpr *Expr);
+
+ SCEVMonotonicity visitConstant(const SCEVConstant *) {
+ return SCEVMonotonicity(SCEVMonotonicityType::Invariant);
+ }
+ SCEVMonotonicity visitVScale(const SCEVVScale *) {
+ return SCEVMonotonicity(SCEVMonotonicityType::Invariant);
+ }
+
+ // TODO: Handle more cases.
+ SCEVMonotonicity visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitAddExpr(const SCEVAddExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitMulExpr(const SCEVMulExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitUDivExpr(const SCEVUDivExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitSMinExpr(const SCEVSMinExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitUMinExpr(const SCEVUMinExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitUnknown(const SCEVUnknown *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+ SCEVMonotonicity visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+ return invariantOrUnknown(Expr);
+ }
+
+ friend struct SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity>;
+};
+
+} // anonymous namespace
+
// Used to test the dependence analyzer.
// Looks through the function, noting instructions that may access memory.
// Calls depends() on every possible pair and prints out the result.
// Ignores all other instructions.
static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
- ScalarEvolution &SE, bool NormalizeResults) {
+ ScalarEvolution &SE, LoopInfo &LI,
+ bool NormalizeResults) {
auto *F = DA->getFunction();
+
+ if (DumpMonotonicityReport) {
+ SCEVMonotonicityChecker Checker(&SE);
+ OS << "Monotonicity check:\n";
+ for (Instruction &Inst : instructions(F)) {
+ if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst))
+ continue;
+ Value *Ptr = getLoadStorePointerOperand(&Inst);
+ const Loop *L = LI.getLoopFor(Inst.getParent());
+ const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L);
+ const SCEV *AccessFn = SE.removePointerBase(PtrSCEV);
+ SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L);
+ OS.indent(2) << "Inst: " << Inst << "\n";
+ OS.indent(4) << "Expr: " << *AccessFn << "\n";
+ Mon.print(OS, 4);
+ }
+ OS << "\n";
+ }
+
for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE;
++SrcI) {
if (SrcI->mayReadOrWriteMemory()) {
@@ -235,7 +468,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
void DependenceAnalysisWrapperPass::print(raw_ostream &OS,
const Module *) const {
dumpExampleDependence(
- OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false);
+ OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), false);
}
PreservedAnalyses
@@ -244,7 +478,7 @@ DependenceAnalysisPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) {
<< "':\n";
dumpExampleDependence(OS, &FAM.getResult<DependenceAnalysis>(F),
FAM.getResult<ScalarEvolutionAnalysis>(F),
- NormalizeResults);
+ FAM.getResult<LoopAnalysis>(F), NormalizeResults);
return PreservedAnalyses::all();
}
@@ -671,6 +905,81 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
}
//===----------------------------------------------------------------------===//
+// SCEVMonotonicity
+
+SCEVMonotonicity::SCEVMonotonicity(SCEVMonotonicityType Type,
+ const SCEV *FailurePoint)
+ : Type(Type), FailurePoint(FailurePoint) {
+ assert(
+ ((Type == SCEVMonotonicityType::Unknown) == (FailurePoint != nullptr)) &&
+ "FailurePoint must be provided iff Type is Unknown");
+}
+
+void SCEVMonotonicity::print(raw_ostream &OS, unsigned Depth) const {
+ OS.indent(Depth) << "Monotonicity: ";
+ switch (Type) {
+ case SCEVMonotonicityType::Unknown:
+ assert(FailurePoint && "FailurePoint must be provided for Unknown");
+ OS << "Unknown\n";
+ OS.indent(Depth) << "Reason: " << *FailurePoint << "\n";
+ break;
+ case SCEVMonotonicityType::Invariant:
+ OS << "Invariant\n";
+ break;
+ case SCEVMonotonicityType::MultivariateSignedMonotonic:
+ OS << "MultivariateSignedMonotonic\n";
+ break;
+ }
+}
+
+bool SCEVMonotonicityChecker::isLoopInvariant(const SCEV *Expr) const {
+ return !OutermostLoop || SE->isLoopInvariant(Expr, OutermostLoop);
+}
+
+SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) {
+ if (isLoopInvariant(Expr))
+ return SCEVMonotonicity(SCEVMonotonicityType::Invariant);
+ return createUnknown(Expr);
+}
+
+SCEVMonotonicity
+SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr,
+ const Loop *OutermostLoop) {
+ assert(Expr->getType()->isIntegerTy() && "Expr must be integer type");
+ this->OutermostLoop = OutermostLoop;
+ return visit(Expr);
+}
+
+/// We only care about an affine AddRec at the moment. For an affine AddRec,
+/// the monotonicity can be inferred from its nowrap property. For example, let
+/// X and Y be loop-invariant, and assume Y is non-negative. An AddRec
+/// {X,+.Y}<nsw> implies:
+///
+/// X <=s (X + Y) <=s ((X + Y) + Y) <=s ...
+///
+/// Thus, we can conclude that the AddRec is monotonically increasing with
+/// respect to the associated loop in a signed sense. Similar reasoning
+/// applies when Y is non-positive, leading to a monotonically decreasing
+/// AddRec.
+SCEVMonotonicity
+SCEVMonotonicityChecker::visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ if (!Expr->isAffine() || !Expr->hasNoSignedWrap())
+ return createUnknown(Expr);
+
+ const SCEV *Start = Expr->getStart();
+ const SCEV *Step = Expr->getStepRecurrence(*SE);
+
+ SCEVMonotonicity StartMon = visit(Start);
+ if (StartMon.isUnknown())
+ return StartMon;
+
+ if (!isLoopInvariant(Step))
+ return createUnknown(Expr);
+
+ return SCEVMonotonicity(SCEVMonotonicityType::MultivariateSignedMonotonic);
+}
+
+//===----------------------------------------------------------------------===//
// DependenceInfo methods
// For debugging purposes. Dumps a dependence to OS.
@@ -1273,6 +1582,13 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,
return nullptr;
}
+/// Returns true iff \p Test is enabled.
+static bool isDependenceTestEnabled(DependenceTestType Test) {
+ if (EnableDependenceTest == DependenceTestType::All)
+ return true;
+ return EnableDependenceTest == Test;
+}
+
// testZIV -
// When we have a pair of subscripts of the form [c1] and [c2],
// where c1 and c2 are both loop invariant, we attack it using
@@ -1334,6 +1650,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
const Loop *CurDstLoop, unsigned Level,
FullDependence &Result,
Constraint &NewConstraint) const {
+ if (!isDependenceTestEnabled(DependenceTestType::StrongSIV))
+ return false;
+
LLVM_DEBUG(dbgs() << "\tStrong SIV test\n");
LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff);
LLVM_DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
@@ -1468,6 +1787,9 @@ bool DependenceInfo::weakCrossingSIVtest(
const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
FullDependence &Result, Constraint &NewConstraint,
const SCEV *&SplitIter) const {
+ if (!isDependenceTestEnabled(DependenceTestType::WeakCrossingSIV))
+ return false;
+
LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n");
LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
@@ -1726,6 +2048,9 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
const Loop *CurDstLoop, unsigned Level,
FullDependence &Result,
Constraint &NewConstraint) const {
+ if (!isDependenceTestEnabled(DependenceTestType::ExactSIV))
+ return false;
+
LLVM_DEBUG(dbgs() << "\tExact SIV test\n");
LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
@@ -1905,6 +2230,9 @@ bool DependenceInfo::weakZeroSrcSIVtest(
const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst,
const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
FullDependence &Result, Constraint &NewConstraint) const {
+ if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV))
+ return false;
+
// For the WeakSIV test, it's possible the loop isn't common to
// the Src and Dst loops. If it isn't, then there's no need to
// record a direction.
@@ -2013,6 +2341,9 @@ bool DependenceInfo::weakZeroDstSIVtest(
const SCEV *SrcCoeff, const SCEV *SrcConst, const SCEV *DstConst,
const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
FullDependence &Result, Constraint &NewConstraint) const {
+ if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV))
+ return false;
+
// For the WeakSIV test, it's possible the loop isn't common to the
// Src and Dst loops. If it isn't, then there's no need to record a direction.
LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
@@ -2096,8 +2427,9 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
const SCEV *SrcConst, const SCEV *DstConst,
const Loop *SrcLoop, const Loop *DstLoop,
FullDependence &Result) const {
- if (RunSIVRoutinesOnly)
+ if (!isDependenceTestEnabled(DependenceTestType::ExactRDIV))
return false;
+
LLVM_DEBUG(dbgs() << "\tExact RDIV test\n");
LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
@@ -2242,8 +2574,9 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
const SCEV *C1, const SCEV *C2,
const Loop *Loop1,
const Loop *Loop2) const {
- if (RunSIVRoutinesOnly)
+ if (!isDependenceTestEnabled(DependenceTestType::SymbolicRDIV))
return false;
+
++SymbolicRDIVapplications;
LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
LLVM_DEBUG(dbgs() << "\t A1 = " << *A1);
@@ -2557,8 +2890,9 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr,
// to "a common divisor".
bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
FullDependence &Result) const {
- if (RunSIVRoutinesOnly)
+ if (!isDependenceTestEnabled(DependenceTestType::GCDMIV))
return false;
+
LLVM_DEBUG(dbgs() << "starting gcd\n");
++GCDapplications;
unsigned BitWidth = SE->getTypeSizeInBits(Src->getType());
@@ -2725,8 +3059,9 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
const SmallBitVector &Loops,
FullDependence &Result) const {
- if (RunSIVRoutinesOnly)
+ if (!isDependenceTestEnabled(DependenceTestType::BanerjeeMIV))
return false;
+
LLVM_DEBUG(dbgs() << "starting Banerjee\n");
++BanerjeeApplications;
LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n');
@@ -3488,10 +3823,19 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
// resize Pair to contain as many pairs of subscripts as the delinearization
// has found, and then initialize the pairs following the delinearization.
Pair.resize(Size);
+ SCEVMonotonicityChecker MonChecker(SE);
+ const Loop *OutermostLoop = SrcLoop ? SrcLoop->getOutermostLoop() : nullptr;
for (int I = 0; I < Size; ++I) {
Pair[I].Src = SrcSubscripts[I];
Pair[I].Dst = DstSubscripts[I];
unifySubscriptType(&Pair[I]);
+
+ if (EnableMonotonicityCheck) {
+ if (MonChecker.checkMonotonicity(Pair[I].Src, OutermostLoop).isUnknown())
+ return false;
+ if (MonChecker.checkMonotonicity(Pair[I].Dst, OutermostLoop).isUnknown())
+ return false;
+ }
}
return true;
@@ -3824,6 +4168,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
Pair[0].Src = SrcEv;
Pair[0].Dst = DstEv;
+ SCEVMonotonicityChecker MonChecker(SE);
+ const Loop *OutermostLoop = SrcLoop ? SrcLoop->getOutermostLoop() : nullptr;
+ if (EnableMonotonicityCheck)
+ if (MonChecker.checkMonotonicity(Pair[0].Src, OutermostLoop).isUnknown() ||
+ MonChecker.checkMonotonicity(Pair[0].Dst, OutermostLoop).isUnknown())
+ return std::make_unique<Dependence>(Src, Dst,
+ SCEVUnionPredicate(Assume, *SE));
+
if (Delinearize) {
if (tryDelinearize(Src, Dst, Pair)) {
LLVM_DEBUG(dbgs() << " delinearized\n");
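
The nsw-based monotonicity argument in visitAddRecExpr is easy to check on a concrete recurrence. A minimal standalone sketch of the same classification (plain C++, not the SCEV API; all names here are illustrative):

    #include <cstdio>

    enum class Monotonicity { Unknown, Invariant, SignedMonotonic };

    // Mirrors SCEVMonotonicityChecker::visitAddRecExpr: an affine
    // recurrence {Start,+,Step} is signed-monotonic when it carries the
    // <nsw> guarantee, its start is itself classifiable, and its step is
    // invariant in the outermost loop. Without <nsw>, F(i) = Start + i*Step
    // could wrap from SINT_MAX to SINT_MIN and the ordering argument fails.
    Monotonicity classifyAffineAddRec(bool IsAffine, bool HasNSW,
                                      Monotonicity StartMon,
                                      bool StepIsLoopInvariant) {
      if (!IsAffine || !HasNSW)
        return Monotonicity::Unknown;
      if (StartMon == Monotonicity::Unknown)
        return Monotonicity::Unknown;
      if (!StepIsLoopInvariant)
        return Monotonicity::Unknown;
      return Monotonicity::SignedMonotonic;
    }

    int main() {
      // {0,+,4}<nsw>, e.g. the subscript SCEV of A[4*i], is monotonic:
      bool Mono = classifyAffineAddRec(true, true, Monotonicity::Invariant,
                                       true) == Monotonicity::SignedMonotonic;
      std::printf("monotonic: %d\n", Mono);
    }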
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index dc813f6..8da51d0 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5106,32 +5106,33 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr,
return Ptr;
// The following transforms are only safe if the ptrtoint cast
- // doesn't truncate the pointers.
- if (Indices[0]->getType()->getScalarSizeInBits() ==
- Q.DL.getPointerSizeInBits(AS)) {
+ // doesn't truncate the address of the pointers. The non-address bits
+ // must be the same, as the underlying objects are the same.
+ if (Indices[0]->getType()->getScalarSizeInBits() >=
+ Q.DL.getAddressSizeInBits(AS)) {
auto CanSimplify = [GEPTy, &P, Ptr]() -> bool {
return P->getType() == GEPTy &&
getUnderlyingObject(P) == getUnderlyingObject(Ptr);
};
// getelementptr V, (sub P, V) -> P if P points to a type of size 1.
if (TyAllocSize == 1 &&
- match(Indices[0],
- m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) &&
+ match(Indices[0], m_Sub(m_PtrToIntOrAddr(m_Value(P)),
+ m_PtrToIntOrAddr(m_Specific(Ptr)))) &&
CanSimplify())
return P;
// getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of
// size 1 << C.
- if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)),
- m_PtrToInt(m_Specific(Ptr))),
+ if (match(Indices[0], m_AShr(m_Sub(m_PtrToIntOrAddr(m_Value(P)),
+ m_PtrToIntOrAddr(m_Specific(Ptr))),
m_ConstantInt(C))) &&
TyAllocSize == 1ULL << C && CanSimplify())
return P;
// getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of
// size C.
- if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)),
- m_PtrToInt(m_Specific(Ptr))),
+ if (match(Indices[0], m_SDiv(m_Sub(m_PtrToIntOrAddr(m_Value(P)),
+ m_PtrToIntOrAddr(m_Specific(Ptr))),
m_SpecificInt(TyAllocSize))) &&
CanSimplify())
return P;
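
All three folds here exploit the same pointer identity; a hedged C++ analogue of the TyAllocSize == 1 case (an illustrative function, not code from the patch):

    #include <cstddef>

    // For a 1-byte element type, the IR pattern
    //   getelementptr V, (sub (ptrtoint P), (ptrtoint V))
    // is the round trip below, and it folds to P whenever P and V share
    // the same underlying object (the CanSimplify() check). The patch
    // widens the match to ptrtoaddr and to index types at least as wide
    // as the address, since the non-address bits of P and V then agree.
    char *roundTrip(char *V, char *P) {
      std::ptrdiff_t D = P - V; // (sub (ptrtoint P), (ptrtoint V))
      return V + D;             // getelementptr V, D  ==>  P
    }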
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index a60a4bb..6cdf51a 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -1100,12 +1100,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
for (auto &GlobalList : Index) {
// Ignore entries for references that are undefined in the current module.
- if (GlobalList.second.SummaryList.empty())
+ if (GlobalList.second.getSummaryList().empty())
continue;
- assert(GlobalList.second.SummaryList.size() == 1 &&
+ assert(GlobalList.second.getSummaryList().size() == 1 &&
"Expected module's index to have one summary per GUID");
- auto &Summary = GlobalList.second.SummaryList[0];
+ auto &Summary = GlobalList.second.getSummaryList()[0];
if (!IsThinLTO) {
Summary->setNotEligibleToImport();
continue;
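
The SummaryList to getSummaryList() migration recurs through the Bitcode, IR, and LTO files below. Judging from these call sites and from the ArrayRef parameter introduced in ThinLTOCodeGenerator.cpp, the new accessor presumably has roughly this shape (a sketch; the authoritative declaration is in llvm/IR/ModuleSummaryIndex.h):

    // Assumed accessor on the per-GUID summary entry, encapsulating the
    // previously public SummaryList member as a read-only view.
    ArrayRef<std::unique_ptr<GlobalValueSummary>> getSummaryList() const {
      return SummaryList;
    }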
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 5e94e0b..5e92ca1 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -1136,7 +1136,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) {
if (!AreStatisticsEnabled())
return;
for (auto &GVS : Index)
- for (auto &GV : GVS.second.SummaryList)
+ for (auto &GV : GVS.second.getSummaryList())
if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get()))
Stat += FS->paramAccesses().size();
};
@@ -1147,7 +1147,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) {
// Convert the ModuleSummaryIndex to a FunctionMap
for (auto &GVS : Index) {
- for (auto &GV : GVS.second.SummaryList) {
+ for (auto &GV : GVS.second.getSummaryList()) {
FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get());
if (!FS || FS->paramAccesses().empty())
continue;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 8ff3aa9..61aa7c2f5 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -235,7 +235,7 @@ public:
return;
for (const auto &GUIDSummaryLists : *Index)
// Examine all summaries for this GUID.
- for (auto &Summary : GUIDSummaryLists.second.SummaryList)
+ for (auto &Summary : GUIDSummaryLists.second.getSummaryList())
if (auto FS = dyn_cast<FunctionSummary>(Summary.get())) {
// For each call in the function summary, see if the call
// is to a GUID (which means it is for an indirect call,
@@ -587,7 +587,7 @@ public:
}
} else {
for (auto &Summaries : Index)
- for (auto &Summary : Summaries.second.SummaryList)
+ for (auto &Summary : Summaries.second.getSummaryList())
Callback({Summaries.first, Summary.get()}, false);
}
}
diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
index 72bb98c..64cbe9d 100644
--- a/llvm/lib/CAS/OnDiskGraphDB.cpp
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -836,6 +836,7 @@ uint64_t DataRecordHandle::getDataSize() const {
case DataSizeFlags::Uses8B:
return support::endian::read64le(DataSizePtr);
}
+ llvm_unreachable("Unknown DataSizeFlags enum");
}
void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
@@ -863,6 +864,7 @@ uint32_t DataRecordHandle::getNumRefs() const {
case NumRefsFlags::Uses8B:
return support::endian::read64le(NumRefsPtr);
}
+ llvm_unreachable("Unknown NumRefsFlags enum");
}
void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
@@ -1270,6 +1272,7 @@ Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
return FaultInResult.takeError();
return true;
}
+ llvm_unreachable("Unknown ObjectPresence enum");
}
Expected<OnDiskGraphDB::ObjectPresence>
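
All three hunks in this file apply the same covered-switch idiom. A minimal self-contained illustration (hypothetical enum; assumes llvm/Support/ErrorHandling.h for llvm_unreachable):

    #include "llvm/Support/ErrorHandling.h"

    enum class DataSize { Uses1B, Uses8B };

    // Even when a switch covers every enumerator, control still falls off
    // the end of the function as far as the compiler's CFG is concerned,
    // so the trailing llvm_unreachable both documents the invariant and
    // silences -Wreturn-type style warnings.
    unsigned numBytes(DataSize S) {
      switch (S) {
      case DataSize::Uses1B:
        return 1;
      case DataSize::Uses8B:
        return 8;
      }
      llvm_unreachable("Unknown DataSize enum");
    }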
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index f0f0861..c7d45897 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -566,32 +566,54 @@ bool DwarfExpression::addExpression(
case dwarf::DW_OP_LLVM_extract_bits_zext: {
unsigned SizeInBits = Op->getArg(1);
unsigned BitOffset = Op->getArg(0);
+ unsigned DerefSize = 0;
+ // Operations are done in the DWARF "generic type" whose size
+ // is the size of a pointer.
+ unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize();
// If we have a memory location then dereference to get the value, though
// we have to make sure we don't dereference any bytes past the end of the
// object.
if (isMemoryLocation()) {
- emitOp(dwarf::DW_OP_deref_size);
- emitUnsigned(alignTo(BitOffset + SizeInBits, 8) / 8);
+ DerefSize = alignTo(BitOffset + SizeInBits, 8) / 8;
+ if (DerefSize == PtrSizeInBytes) {
+ emitOp(dwarf::DW_OP_deref);
+ } else {
+ emitOp(dwarf::DW_OP_deref_size);
+ emitUnsigned(DerefSize);
+ }
}
- // Extract the bits by a shift left (to shift out the bits after what we
- // want to extract) followed by shift right (to shift the bits to position
- // 0 and also sign/zero extend). These operations are done in the DWARF
- // "generic type" whose size is the size of a pointer.
- unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize();
- unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset);
- unsigned RightShift = LeftShift + BitOffset;
- if (LeftShift) {
- emitOp(dwarf::DW_OP_constu);
- emitUnsigned(LeftShift);
- emitOp(dwarf::DW_OP_shl);
- }
- if (RightShift) {
- emitOp(dwarf::DW_OP_constu);
- emitUnsigned(RightShift);
- emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra
- : dwarf::DW_OP_shr);
+ // If a dereference was emitted for an unsigned value, and
+ // there's no bit offset, then a bit of optimization is
+ // possible.
+ if (OpNum == dwarf::DW_OP_LLVM_extract_bits_zext && BitOffset == 0) {
+ if (8 * DerefSize == SizeInBits) {
+ // The correct value is already on the stack.
+ } else {
+ // No need to shift, we can just mask off the desired bits.
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned((1u << SizeInBits) - 1);
+ emitOp(dwarf::DW_OP_and);
+ }
+ } else {
+ // Extract the bits by a shift left (to shift out the bits after what we
+ // want to extract) followed by shift right (to shift the bits to
+ // position 0 and also sign/zero extend).
+ unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset);
+ unsigned RightShift = LeftShift + BitOffset;
+ if (LeftShift) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(LeftShift);
+ emitOp(dwarf::DW_OP_shl);
+ }
+ if (RightShift) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(RightShift);
+ emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext
+ ? dwarf::DW_OP_shra
+ : dwarf::DW_OP_shr);
+ }
}
// The value is now at the top of the stack, so set the location to
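
The operand arithmetic is worth checking with concrete numbers. A small sketch (assuming 8-byte code pointers, i.e. a 64-bit DWARF generic type) that reproduces both the shift path and the new zext fast path:

    #include <cassert>

    int main() {
      const unsigned PtrSizeInBytes = 8; // generic type is 64 bits wide

      // Sign-extract 8 bits at bit offset 4 from a memory location:
      unsigned SizeInBits = 8, BitOffset = 4;
      unsigned DerefSize = (BitOffset + SizeInBits + 7) / 8; // alignTo(12, 8) / 8
      assert(DerefSize == 2); // DW_OP_deref_size 2
      unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset);
      unsigned RightShift = LeftShift + BitOffset;
      assert(LeftShift == 52 && RightShift == 56); // constu 52; shl; constu 56; shra

      // Zero-extract a whole 16-bit load, the new fast path: DW_OP_deref_size 2
      // already leaves the zero-extended value on the stack, so no shifts
      // or mask are emitted at all.
      SizeInBits = 16; BitOffset = 0;
      DerefSize = (BitOffset + SizeInBits + 7) / 8;
      assert(8 * DerefSize == SizeInBits);
      return 0;
    }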
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index f80e1e8..3ac6d2a 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -1498,7 +1498,7 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
// Before stepping forward past MI, remember which regs were live
// before MI. This is needed to set the Undef flag only when reg is
// dead.
- SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI;
+ SparseSet<MCPhysReg, MCPhysReg> LiveBeforeMI;
LiveBeforeMI.setUniverse(TRI->getNumRegs());
for (unsigned Reg : Redefs)
LiveBeforeMI.insert(Reg);
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 804480c..72b364c 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -211,7 +211,7 @@ private:
unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }
};
- using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>;
+ using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>;
/// This map contains entries for each virtual register that is currently
/// available in a physical register.
LiveRegMap LiveVirtRegs;
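
This SparseSet change and the one in IfConversion.cpp above point at the same template update. Reconstructed from the two call sites alone (the authoritative declaration is llvm/ADT/SparseSet.h), the parameter list now names the key type explicitly, with the identity key functor moved to a defaulted position:

    // Assumed shape, inferred from the call sites in this commit:
    //
    //   template <typename ValueT, typename KeyT,
    //             typename KeyFunctorT /* = identity_cxx20 or similar */,
    //             typename SparseT = uint8_t>
    //   class SparseSet;
    //
    // so SparseSet<MCPhysReg, identity<MCPhysReg>> becomes
    // SparseSet<MCPhysReg, MCPhysReg>, and users with a custom sparse
    // type now pass the functor explicitly, as LiveRegMap does here.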
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6bf9008..310d35d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16433,7 +16433,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::OR:
case ISD::XOR:
if (!LegalOperations && N0.hasOneUse() &&
- (isConstantOrConstantVector(N0.getOperand(0), true) ||
+ (N0.getOperand(0) == N0.getOperand(1) ||
+ isConstantOrConstantVector(N0.getOperand(0), true) ||
isConstantOrConstantVector(N0.getOperand(1), true))) {
// TODO: We already restricted this to pre-legalization, but for vectors
// we are extra cautious to not create an unsupported operation.
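
The new N0.getOperand(0) == N0.getOperand(1) check admits one more profitable shape: when both operands are the same node, narrowing duplicates only a single live value. A hedged before/after (the visible case list is truncated; in tree it also covers arithmetic opcodes such as ISD::MUL):

    // Before: (i16 (trunc (mul i32 %x, %x)))
    // After:  (mul (i16 (trunc %x)), (i16 (trunc %x)))
    // Both narrowed operands CSE to the same node, so only %x stays
    // live across the rewrite, just like the constant-operand cases.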
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 212a0c0..db5cc37 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -107,6 +107,28 @@ static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) {
return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference();
}
+static llvm::StringRef
+prettyLanguageVersionString(const DWARFAttribute &AttrValue,
+ const DWARFDie &Die) {
+ if (AttrValue.Attr != DW_AT_language_version)
+ return {};
+
+ auto NameForm = Die.find(DW_AT_language_name);
+ if (!NameForm)
+ return {};
+
+ auto LName = NameForm->getAsUnsignedConstant();
+ if (!LName)
+ return {};
+
+ auto LVersion = AttrValue.Value.getAsUnsignedConstant();
+ if (!LVersion)
+ return {};
+
+ return llvm::dwarf::LanguageDescription(
+ static_cast<SourceLanguageName>(*LName), *LVersion);
+}
+
static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
const DWARFAttribute &AttrValue, unsigned Indent,
DIDumpOptions DumpOpts) {
@@ -146,15 +168,28 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
} else if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant())
Name = AttributeValueString(Attr, *Val);
- if (!Name.empty())
- WithColor(OS, Color) << Name;
- else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column ||
- Attr == DW_AT_call_line || Attr == DW_AT_call_column ||
- Attr == DW_AT_language_version) {
+ auto DumpUnsignedConstant = [&OS,
+ &DumpOpts](const DWARFFormValue &FormValue) {
if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant())
OS << *Val;
else
FormValue.dump(OS, DumpOpts);
+ };
+
+ llvm::StringRef PrettyVersionName =
+ prettyLanguageVersionString(AttrValue, Die);
+ bool ShouldDumpRawLanguageVersion =
+ Attr == DW_AT_language_version &&
+ (DumpOpts.Verbose || PrettyVersionName.empty());
+
+ if (!Name.empty())
+ WithColor(OS, Color) << Name;
+ else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column ||
+ Attr == DW_AT_call_line || Attr == DW_AT_call_column) {
+ DumpUnsignedConstant(FormValue);
+ } else if (Attr == DW_AT_language_version) {
+ if (ShouldDumpRawLanguageVersion)
+ DumpUnsignedConstant(FormValue);
} else if (Attr == DW_AT_low_pc &&
(FormValue.getAsAddress() ==
dwarf::computeTombstoneAddress(U->getAddressByteSize()))) {
@@ -226,6 +261,10 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
DumpOpts.RecoverableErrorHandler(createStringError(
errc::invalid_argument, "decoding address ranges: %s",
toString(RangesOrError.takeError()).c_str()));
+ } else if (Attr == DW_AT_language_version) {
+ if (!PrettyVersionName.empty())
+ WithColor(OS, Color) << (ShouldDumpRawLanguageVersion ? " " : "")
+ << PrettyVersionName;
}
OS << ")\n";
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 3908a78..488b078 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -3196,7 +3196,7 @@ void AssemblyWriter::printModuleSummaryIndex() {
// for aliasee (then update BitcodeWriter.cpp and remove get/setAliaseeGUID).
for (auto &GlobalList : *TheIndex) {
auto GUID = GlobalList.first;
- for (auto &Summary : GlobalList.second.SummaryList)
+ for (auto &Summary : GlobalList.second.getSummaryList())
SummaryToGUIDMap[Summary.get()] = GUID;
}
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 9060a89..3b8fde8 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2878,7 +2878,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
{ 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc |
{ 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt |
{ 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt |
- { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
{ 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr |
{ 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast |
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index dc55b63..a6353664 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -162,7 +162,7 @@ void ModuleSummaryIndex::collectDefinedFunctionsForModule(
StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const {
for (auto &GlobalList : *this) {
auto GUID = GlobalList.first;
- for (auto &GlobSummary : GlobalList.second.SummaryList) {
+ for (auto &GlobSummary : GlobalList.second.getSummaryList()) {
auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get());
if (!Summary)
// Ignore global variable, focus on functions
@@ -263,7 +263,7 @@ void ModuleSummaryIndex::propagateAttributes(
DenseSet<ValueInfo> MarkedNonReadWriteOnly;
for (auto &P : *this) {
bool IsDSOLocal = true;
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
if (!isGlobalValueLive(S.get())) {
// computeDeadSymbolsAndUpdateIndirectCalls should have marked all
// copies live. Note that it is possible that there is a GUID collision
@@ -273,7 +273,7 @@ void ModuleSummaryIndex::propagateAttributes(
// all copies live we can assert here that all are dead if any copy is
// dead.
assert(llvm::none_of(
- P.second.SummaryList,
+ P.second.getSummaryList(),
[&](const std::unique_ptr<GlobalValueSummary> &Summary) {
return isGlobalValueLive(Summary.get());
}));
@@ -308,16 +308,16 @@ void ModuleSummaryIndex::propagateAttributes(
// Mark the flag in all summaries false so that we can do quick check
// without going through the whole list.
for (const std::unique_ptr<GlobalValueSummary> &Summary :
- P.second.SummaryList)
+ P.second.getSummaryList())
Summary->setDSOLocal(false);
}
setWithAttributePropagation();
setWithDSOLocalPropagation();
if (llvm::AreStatisticsEnabled())
for (auto &P : *this)
- if (P.second.SummaryList.size())
+ if (P.second.getSummaryList().size())
if (auto *GVS = dyn_cast<GlobalVarSummary>(
- P.second.SummaryList[0]->getBaseObject()))
+ P.second.getSummaryList()[0]->getBaseObject()))
if (isGlobalValueLive(GVS)) {
if (GVS->maybeReadOnly())
ReadOnlyLiveGVars++;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index aec8891..cbc0b1d 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -457,7 +457,7 @@ void llvm::thinLTOResolvePrevailingInIndex(
// when needed.
DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias;
for (auto &I : Index)
- for (auto &S : I.second.SummaryList)
+ for (auto &S : I.second.getSummaryList())
if (auto AS = dyn_cast<AliasSummary>(S.get()))
GlobalInvolvedWithAlias.insert(&AS->getAliasee());
@@ -1182,7 +1182,7 @@ Error LTO::checkPartiallySplit() {
// Otherwise check if there are any recorded in the combined summary from the
// ThinLTO modules.
for (auto &P : ThinLTO.CombinedIndex) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 280c3d1..93118be 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -770,11 +770,11 @@ bool lto::initImportList(const Module &M,
// via a WriteIndexesThinBackend.
for (const auto &GlobalList : CombinedIndex) {
// Ignore entries for undefined references.
- if (GlobalList.second.SummaryList.empty())
+ if (GlobalList.second.getSummaryList().empty())
continue;
auto GUID = GlobalList.first;
- for (const auto &Summary : GlobalList.second.SummaryList) {
+ for (const auto &Summary : GlobalList.second.getSummaryList()) {
// Skip the summaries for the importing module. These are included to
// e.g. record required linkage changes.
if (Summary->modulePath() == M.getModuleIdentifier())
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 5b333cd..ff94c54 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -101,8 +101,8 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
}
-static const GlobalValueSummary *
-getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) {
+static const GlobalValueSummary *getFirstDefinitionForLinker(
+ ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) {
// If there is any strong definition anywhere, get it.
auto StrongDefForLinker = llvm::find_if(
GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) {
@@ -131,14 +131,15 @@ getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) {
static void computePrevailingCopies(
const ModuleSummaryIndex &Index,
DenseMap<GlobalValue::GUID, const GlobalValueSummary *> &PrevailingCopy) {
- auto HasMultipleCopies = [&](const GlobalValueSummaryList &GVSummaryList) {
- return GVSummaryList.size() > 1;
- };
+ auto HasMultipleCopies =
+ [&](ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) {
+ return GVSummaryList.size() > 1;
+ };
for (auto &I : Index) {
- if (HasMultipleCopies(I.second.SummaryList))
+ if (HasMultipleCopies(I.second.getSummaryList()))
PrevailingCopy[I.first] =
- getFirstDefinitionForLinker(I.second.SummaryList);
+ getFirstDefinitionForLinker(I.second.getSummaryList());
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b..a81de5c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N,
static SDValue
performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ // If a DUP(Op0) already exists, reuse it for the scalar_to_vector.
+ if (DCI.isAfterLegalizeDAG()) {
+ if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(),
+ N->getOperand(0)))
+ return SDValue(LN, 0);
+ }
+
// Let's do below transform.
//
// t34: v4i32 = AArch64ISD::UADDLV t2
@@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
// Let's generate new sequence with AArch64ISD::NVCAST.
- SDLoc DL(N);
SDValue EXTRACT_SUBVEC =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
DAG.getConstant(0, DL, MVT::i64));
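
A note on why the DUP reuse above is sound (reasoning about the combine, not text from the patch):

    // DUP(x)              = < x, x, ..., x >
    // scalar_to_vector(x) = < x, undef, ..., undef >
    // Lane 0 agrees and every other DUP lane is a legal refinement of
    // undef, so when CSE already holds a DUP(x) with the right value
    // types after legalization, getNodeIfExists can hand it back in
    // place of a fresh scalar_to_vector node.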
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+ BarrierLatency() = default;
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+ constexpr unsigned SyntheticLatency = 2000;
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+ continue;
+
+ // Update latency on barrier edges of ATOMIC_FENCE.
+ // We don't consider the scope of the fence or type of instruction
+ // involved in the barrier edge.
+ for (SDep &PredDep : SU.Preds) {
+ if (!PredDep.isBarrier())
+ continue;
+ SUnit *PredSU = PredDep.getSUnit();
+ MachineInstr *MI = PredSU->getInstr();
+ // Only consider memory loads
+ if (!MI->mayLoad() || MI->mayStore())
+ continue;
+ SDep ForwardD = PredDep;
+ ForwardD.setSUnit(&SU);
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+ break;
+ }
+ }
+ PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+ PredSU->setDepthDirty();
+ SU.setDepthDirty();
+ }
+ }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+ return std::make_unique<BarrierLatency>();
+}
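
One detail of the mutation deserves a note: each dependence edge is stored twice, and the loop deliberately updates both copies.

    // An SDep lives once in the fence's Preds and once, mirrored, in the
    // predecessor's Succs. ForwardD rebuilds the mirrored copy by
    // retargeting the pred edge at the fence's SUnit, the inner loop
    // locates that copy with SDep::operator==, and both latencies are
    // raised by the same synthetic 2000 cycles. setDepthDirty() then
    // invalidates the cached depths of both endpoints so the scheduler
    // recomputes them with the new edge weights.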
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a20..996b55f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 13f727b68..a1e0e52 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAsmPrinter.cpp
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
+ AMDGPUBarrierLatency.cpp
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 74d4153..6f1feb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2223,8 +2223,8 @@ def : GCNPat <
def : GCNPat <
(DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
- (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
- 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0),
+ (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src,
+ !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0),
0, 0, 0, 0, 0)
> {
let SubtargetPredicate = HasPackedFP32Ops;
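
The TableGen change swaps a magic constant for named source modifiers (the old comment also misspelled NEG_HI as "HEG_HI"). Assuming the usual SRCMODS encoding in SIInstrInfo.td (OP_SEL_1 = 8, NEG = 1, NEG_HI = 2), the !or evaluates to the value it replaces:

    static_assert((8 | 1 | 2) == 11,
                  "OP_SEL_1 | NEG | NEG_HI matches the old literal");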
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bdbc000..07264d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -397,12 +397,6 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
- /// Inserts any necessary instructions before the barrier start instruction
- /// \p MI in order to support pairing of barriers and fences.
- virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
- return false;
- };
-
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
@@ -583,12 +577,8 @@ public:
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order, bool AtomicsOnly) const override;
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -2069,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2225,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx10CacheControl::insertBarrierStart(
- MachineBasicBlock::iterator &MI) const {
- // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
- // mode. This is because a CU mode release fence does not emit any wait, which
- // is fine when only dealing with vmem, but isn't sufficient in the presence
- // of barriers which do not go through vmem.
- // GFX12.5 does not require this additional wait.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
- return false;
-
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
- return true;
-}
-
bool SIGfx11CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
@@ -2419,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// In WGP mode the waves of a work-group can be executing on either CU
// of the WGP. Therefore need to wait for operations to complete to
// ensure they are visible to waves in the other CU as the L0 is per CU.
+ //
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
//
// GFX12.5:
// CU$ has two ports. To ensure operations are visible at the workgroup
// level, we need to ensure all operations in this port have completed
// so the other SIMDs in the WG can see them. There is no ordering
// guarantee between the ports.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
+ isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -3017,11 +2999,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
- if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
- Changed |= CC->insertBarrierStart(MI);
- continue;
- }
-
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 96ee69c..406f4c1 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI,
continue;
// Skip the lr predicate reg
int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
- if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
+ if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg)
continue;
// Check that this instruction will produce zeros in its false lanes:
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 5eeb4fe..413e844 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
Register LR = LoopPhi->getOperand(0).getReg();
for (MachineInstr *MI : MVEInstrs) {
int Idx = findFirstVPTPredOperandIdx(*MI);
- MI->getOperand(Idx + 2).setReg(LR);
+ MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR);
}
}
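
Both ARM hunks substitute the same named index for a bare + 2. Inferred from the replaced literals only (the enum definition is not part of this diff):

    // Assumed layout of the vpred_n sub-operand bundle, e.g.
    // { predication mask, predication reg, tail-predication reg }, which
    // would make ARM::SUBOP_vpred_n_tp_reg equal to the literal 2 it
    // replaces. A hypothetical sanity check, given the generated enum:
    //   static_assert(ARM::SUBOP_vpred_n_tp_reg == 2, "tp-reg offset");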
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8bf0d11..d477522 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
// If we're enabling GP optimizations, use hardware square root
- if (!Subtarget.hasFSQRT() &&
- !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
- Subtarget.hasFRE()))
+ if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
if (!Subtarget.hasFSQRT() &&
- !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
- Subtarget.hasFRES()))
+ !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
if (Subtarget.hasFCPSGN()) {
@@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::LRINT, MVT::f64, Legal);
- setOperationAction(ISD::LRINT, MVT::f32, Legal);
- setOperationAction(ISD::LLRINT, MVT::f64, Legal);
- setOperationAction(ISD::LLRINT, MVT::f32, Legal);
- setOperationAction(ISD::LROUND, MVT::f64, Legal);
- setOperationAction(ISD::LROUND, MVT::f32, Legal);
- setOperationAction(ISD::LLROUND, MVT::f64, Legal);
- setOperationAction(ISD::LLROUND, MVT::f32, Legal);
- }
+
+ setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom);
} else {
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
// The nearbyint variants are not allowed to raise the inexact exception
- // so we can only code-gen them with unsafe math.
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- }
+ // so we can only code-gen them with fpexcept.ignore.
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
@@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// be lost at this stage, but is below the single-precision rounding
// position.
//
- // However, if -enable-unsafe-fp-math is in effect, accept double
+  // However, if the 'afn' fast-math flag is in effect, accept double
// rounding to avoid the extra overhead.
- if (Op.getValueType() == MVT::f32 &&
- !Subtarget.hasFPCVT() &&
- !DAG.getTarget().Options.UnsafeFPMath) {
+  // FIXME: INT_TO_FP cannot currently carry fast-math flags because of
+  // the nneg flag, so Op->getFlags().hasApproximateFuncs() is always
+  // false.
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() &&
+ !Op->getFlags().hasApproximateFuncs()) {
// Twiddle input to make sure the low 11 bits are zero. (If this
// is the case, we are guaranteed the value will fit into the 53 bit
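
The comment above refers to the classic double-rounding hazard: once a 64-bit integer exceeds 2^53, converting it to f32 through an intermediate f64 can round differently from a single direct conversion. A standalone illustration in plain C++ (not part of the patch):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // 2^53 + 2^29 + 1 lies just above the midpoint between two adjacent floats.
    uint64_t I = (1ULL << 53) + (1ULL << 29) + 1;
    float Direct = static_cast<float>(I);                      // one rounding: up to 2^53 + 2^30
    float Twice = static_cast<float>(static_cast<double>(I));  // f64 ties-to-even, then f32: 2^53
    std::printf("%f vs %f\n", Direct, Twice);                  // the two results differ
  }
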
@@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerADDSUBO_CARRY(Op, DAG);
case ISD::UCMP:
return LowerUCMP(Op, DAG);
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_FNEARBYINT:
+ if (Op->getFlags().hasNoFPExcept())
+ return Op;
+ return SDValue();
}
}
@@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+ .addImm(PPC::PRED_NE_MINUS)
+ .addReg(PPC::CR0)
+ .addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
@@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR0)
.addMBB(loopMBB);
BB->addSuccessor(loopMBB);
@@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(dest)
.addReg(oldval);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(CrReg)
.addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
@@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(ptrA)
.addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
@@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
const Function *F = I->getFunction();
const DataLayout &DL = F->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
+ bool AllowContract = I->getFastMathFlags().allowContract() &&
+ User->getFastMathFlags().allowContract();
- return !(
- isFMAFasterThanFMulAndFAdd(*F, Ty) &&
- isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
+ return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast));
}
case Instruction::Load: {
// Don't break "store (load float*)" pattern, this pattern will be combined
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 979ba31..885bed6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour,
// these need to be defined after the any_frint versions so ISEL will correctly
// add the chain to the strict versions.
-def : Pat<(f32 (fnearbyint f32:$S)),
+// TODO: Match strict FP rounding intrinsics with instructions like xsrdpiz once
+// the rounding mode is propagated to CodeGen.
+def : Pat<(f32 (strict_fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f64 (fnearbyint f64:$S)),
+def : Pat<(f64 (strict_fnearbyint f64:$S)),
(f64 (XSRDPIC $S))>;
-def : Pat<(v2f64 (fnearbyint v2f64:$S)),
+def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)),
(v2f64 (XVRDPIC $S))>;
-def : Pat<(v4f32 (fnearbyint v4f32:$S)),
+def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)),
(v4f32 (XVRSPIC $S))>;
// Materialize a zero-vector of long long
@@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)),
(f64 (MTVSRD $S))>;
// Rounding to integer.
-def : Pat<(i64 (lrint f64:$S)),
+def : Pat<(i64 (strict_lrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (lrint f32:$S)),
+def : Pat<(i64 (strict_lrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (llrint f64:$S)),
+def : Pat<(i64 (strict_llrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (llrint f32:$S)),
+def : Pat<(i64 (strict_llrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (lround f64:$S)),
+def : Pat<(i64 (strict_lround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (lround f32:$S)),
+def : Pat<(i64 (strict_lround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-def : Pat<(i32 (lround f64:$S)),
+def : Pat<(i32 (strict_lround f64:$S)),
(i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>;
-def : Pat<(i32 (lround f32:$S)),
+def : Pat<(i32 (strict_lround f32:$S)),
(i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-def : Pat<(i64 (llround f64:$S)),
+def : Pat<(i64 (strict_llround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (llround f32:$S)),
+def : Pat<(i64 (strict_llround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
// Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0a53ba9..a77d765 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24044,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
}
- std::pair<Register, const TargetRegisterClass *> Res =
- TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
- // If we picked one of the Zfinx register classes, remap it to the GPR class.
- // FIXME: When Zfinx is supported in CodeGen this will need to take the
- // Subtarget into account.
- if (Res.second == &RISCV::GPRF16RegClass ||
- Res.second == &RISCV::GPRF32RegClass ||
- Res.second == &RISCV::GPRPairRegClass)
- return std::make_pair(Res.first, &RISCV::GPRRegClass);
-
- return Res;
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
InlineAsm::ConstraintCode
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 28ee444..a29faab 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1368,13 +1368,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest(
FunctionImporter::ImportMapTy &ImportList) {
for (const auto &GlobalList : Index) {
// Ignore entries for undefined references.
- if (GlobalList.second.SummaryList.empty())
+ if (GlobalList.second.getSummaryList().empty())
continue;
auto GUID = GlobalList.first;
- assert(GlobalList.second.SummaryList.size() == 1 &&
+ assert(GlobalList.second.getSummaryList().size() == 1 &&
"Expected individual combined index to have one summary per GUID");
- auto &Summary = GlobalList.second.SummaryList[0];
+ auto &Summary = GlobalList.second.getSummaryList()[0];
// Skip the summaries for the importing module. These are included to
// e.g. record required linkage changes.
if (Summary->modulePath() == ModulePath)
@@ -1423,7 +1423,7 @@ void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index,
void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) {
for (const auto &Entry : Index) {
- for (const auto &S : Entry.second.SummaryList) {
+ for (const auto &S : Entry.second.getSummaryList()) {
if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
updateValueInfoForIndirectCalls(Index, FS);
}
@@ -1456,7 +1456,7 @@ void llvm::computeDeadSymbolsAndUpdateIndirectCalls(
// Add values flagged in the index as live roots to the worklist.
for (const auto &Entry : Index) {
auto VI = Index.getValueInfo(Entry);
- for (const auto &S : Entry.second.SummaryList) {
+ for (const auto &S : Entry.second.getSummaryList()) {
if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
updateValueInfoForIndirectCalls(Index, FS);
if (S->isLive()) {
@@ -2094,7 +2094,7 @@ static bool doImportingForModuleForTest(
// is only enabled when testing importing via the 'opt' tool, which does
// not do the ThinLink that would normally determine what values to promote.
for (auto &I : *Index) {
- for (auto &S : I.second.SummaryList) {
+ for (auto &S : I.second.getSummaryList()) {
if (GlobalValue::isLocalLinkage(S->linkage()))
S->setLinkage(GlobalValue::ExternalLinkage);
}
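
The SummaryList changes in this and the following IPO files are a mechanical switch from direct field access to an accessor on the summary info type. A minimal sketch of the shape these call sites now rely on, with placeholder types (the real accessor lives in ModuleSummaryIndex.h; its exact return type is assumed here):

  #include <memory>
  #include <vector>

  struct Summary { /* ... */ };

  class SummaryInfo {
    std::vector<std::unique_ptr<Summary>> List; // no longer reached into directly
  public:
    const std::vector<std::unique_ptr<Summary>> &getSummaryList() const {
      return List;
    }
  };
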
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index be6cba3..46fb567 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -2130,7 +2130,7 @@ bool LowerTypeTestsModule::lower() {
// A set of all functions that are address taken by a live global object.
DenseSet<GlobalValue::GUID> AddressTaken;
for (auto &I : *ExportSummary)
- for (auto &GVS : I.second.SummaryList)
+ for (auto &GVS : I.second.getSummaryList())
if (GVS->isLive())
for (const auto &Ref : GVS->refs()) {
AddressTaken.insert(Ref.getGUID());
@@ -2409,7 +2409,7 @@ bool LowerTypeTestsModule::lower() {
}
for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
if (!ExportSummary->isGlobalValueLive(S.get()))
continue;
if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 2d5cb82..76e588b 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -928,7 +928,7 @@ void llvm::updateVCallVisibilityInIndex(
// linker, as we have no information on their eventual use.
if (DynamicExportSymbols.count(P.first))
continue;
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
if (!GVar ||
GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
@@ -2413,7 +2413,7 @@ bool DevirtModule::run() {
}
for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
@@ -2564,7 +2564,7 @@ void DevirtIndex::run() {
// Collect information from summary about which calls to try to devirtualize.
for (auto &P : ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index cdc559b..9b9fe26 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1643,33 +1643,46 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
/// Return a Constant* for the specified floating-point constant if it fits
/// in the specified FP type without changing its value.
-static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+static bool fitsInFPType(APFloat F, const fltSemantics &Sem) {
bool losesInfo;
- APFloat F = CFP->getValueAPF();
(void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
return !losesInfo;
}
-static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
- if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
- return nullptr; // No constant folding of this.
+static Type *shrinkFPConstant(LLVMContext &Ctx, const APFloat &F,
+ bool PreferBFloat) {
// See if the value can be truncated to bfloat and then reextended.
- if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat()))
- return Type::getBFloatTy(CFP->getContext());
+ if (PreferBFloat && fitsInFPType(F, APFloat::BFloat()))
+ return Type::getBFloatTy(Ctx);
// See if the value can be truncated to half and then reextended.
- if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf()))
- return Type::getHalfTy(CFP->getContext());
+ if (!PreferBFloat && fitsInFPType(F, APFloat::IEEEhalf()))
+ return Type::getHalfTy(Ctx);
// See if the value can be truncated to float and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEsingle()))
- return Type::getFloatTy(CFP->getContext());
- if (CFP->getType()->isDoubleTy())
- return nullptr; // Won't shrink.
- if (fitsInFPType(CFP, APFloat::IEEEdouble()))
- return Type::getDoubleTy(CFP->getContext());
+ if (fitsInFPType(F, APFloat::IEEEsingle()))
+ return Type::getFloatTy(Ctx);
+ if (&F.getSemantics() == &APFloat::IEEEdouble())
+ return nullptr; // Won't shrink.
+ // See if the value can be truncated to double and then reextended.
+ if (fitsInFPType(F, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(Ctx);
// Don't try to shrink to various long double types.
return nullptr;
}
+static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
+ Type *Ty = CFP->getType();
+ if (Ty->getScalarType()->isPPC_FP128Ty())
+ return nullptr; // No constant folding of this.
+
+ Type *ShrinkTy =
+ shrinkFPConstant(CFP->getContext(), CFP->getValueAPF(), PreferBFloat);
+ if (ShrinkTy)
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ ShrinkTy = VectorType::get(ShrinkTy, VecTy);
+
+ return ShrinkTy;
+}
+
// Determine if this is a vector of ConstantFPs and if so, return the minimal
// type we can safely truncate all elements to.
static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) {
@@ -1720,10 +1733,10 @@ static Type *getMinimumFPType(Value *V, bool PreferBFloat) {
// Try to shrink scalable and fixed splat vectors.
if (auto *FPC = dyn_cast<Constant>(V))
- if (isa<VectorType>(V->getType()))
+ if (auto *VTy = dyn_cast<VectorType>(V->getType()))
if (auto *Splat = dyn_cast_or_null<ConstantFP>(FPC->getSplatValue()))
if (Type *T = shrinkFPConstant(Splat, PreferBFloat))
- return T;
+ return VectorType::get(T, VTy);
// Try to shrink a vector of FP constants. This returns nullptr on scalable
// vectors
@@ -1796,10 +1809,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
Type *Ty = FPT.getType();
auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
if (BO && BO->hasOneUse()) {
- Type *LHSMinType =
- getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy());
- Type *RHSMinType =
- getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy());
+ bool PreferBFloat = Ty->getScalarType()->isBFloatTy();
+ Type *LHSMinType = getMinimumFPType(BO->getOperand(0), PreferBFloat);
+ Type *RHSMinType = getMinimumFPType(BO->getOperand(1), PreferBFloat);
unsigned OpWidth = BO->getType()->getFPMantissaWidth();
unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
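
fitsInFPType now takes the APFloat by value, so the lossy convert() call mutates only a copy. A small usage sketch of the same losesInfo check, using the APFloat API already visible in the hunk:

  #include "llvm/ADT/APFloat.h"

  static bool fitsInHalf(const llvm::APFloat &Orig) {
    llvm::APFloat Copy = Orig; // operate on a copy, as the new helper does
    bool LosesInfo = false;
    Copy.convert(llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven,
                 &LosesInfo);
    return !LosesInfo; // e.g. true for 1.5, false for 0.1
  }
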
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1cc9173..adf27be 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7231,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
return DenseMap<const SCEV *, Value *>();
}
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8202,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
+
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
+ VPlans.push_back(std::move(P));
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3f18bd7..cdb9e7e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5577,62 +5577,79 @@ private:
}
// Decrement the unscheduled counter and insert to ready list if
// ready.
- auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
- unsigned OpIdx) {
- if (!ScheduleCopyableDataMap.empty()) {
- const EdgeInfo EI = {UserTE, OpIdx};
- if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
- DecrUnsched(CD, /*IsControl=*/false);
- return;
- }
- }
- auto It = OperandsUses.find(I);
- assert(It != OperandsUses.end() && "Operand not found");
- if (It->second > 0) {
- --It->getSecond();
- assert(TotalOpCount > 0 && "No more operands to decrement");
- --TotalOpCount;
- if (ScheduleData *OpSD = getScheduleData(I))
- DecrUnsched(OpSD, /*IsControl=*/false);
- }
- };
+ auto DecrUnschedForInst =
+ [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
+ SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
+ &Checked) {
+ if (!ScheduleCopyableDataMap.empty()) {
+ const EdgeInfo EI = {UserTE, OpIdx};
+ if (ScheduleCopyableData *CD =
+ getScheduleCopyableData(EI, I)) {
+ if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
+ return;
+ DecrUnsched(CD, /*IsControl=*/false);
+ return;
+ }
+ }
+ auto It = OperandsUses.find(I);
+ assert(It != OperandsUses.end() && "Operand not found");
+ if (It->second > 0) {
+ --It->getSecond();
+ assert(TotalOpCount > 0 && "No more operands to decrement");
+ --TotalOpCount;
+ if (ScheduleData *OpSD = getScheduleData(I)) {
+ if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
+ return;
+ DecrUnsched(OpSD, /*IsControl=*/false);
+ }
+ }
+ };
for (ScheduleBundle *Bundle : Bundles) {
if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
break;
// Need to search for the lane since the tree entry can be
// reordered.
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
- find(Bundle->getTreeEntry()->Scalars, In));
- assert(Lane >= 0 && "Lane not set");
- if (isa<StoreInst>(In) &&
- !Bundle->getTreeEntry()->ReorderIndices.empty())
- Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
- assert(Lane < static_cast<int>(
- Bundle->getTreeEntry()->Scalars.size()) &&
- "Couldn't find extract lane");
-
- // Since vectorization tree is being built recursively this
- // assertion ensures that the tree entry has all operands set before
- // reaching this code. Couple of exceptions known at the moment are
- // extracts where their second (immediate) operand is not added.
- // Since immediates do not affect scheduler behavior this is
- // considered okay.
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
- In->getNumOperands() ==
- Bundle->getTreeEntry()->getNumOperands() ||
- Bundle->getTreeEntry()->isCopyableElement(In)) &&
- "Missed TreeEntry operands?");
-
- for (unsigned OpIdx :
- seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
- if (auto *I = dyn_cast<Instruction>(
- Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
- LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
- << "\n");
- DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
- }
+ auto *It = find(Bundle->getTreeEntry()->Scalars, In);
+ SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
+ do {
+ int Lane =
+ std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(In) &&
+ !Bundle->getTreeEntry()->ReorderIndices.empty())
+ Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(
+ Bundle->getTreeEntry()->Scalars.size()) &&
+ "Couldn't find extract lane");
+
+              // Since the vectorization tree is built recursively, this
+              // assertion ensures that the tree entry has all operands set
+              // before reaching this code. A couple of known exceptions are
+              // extracts, whose second (immediate) operand is not added.
+              // Since immediates do not affect scheduler behavior, this is
+              // considered okay.
+ assert(In &&
+ (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
+ In->getNumOperands() ==
+ Bundle->getTreeEntry()->getNumOperands() ||
+ Bundle->getTreeEntry()->isCopyableElement(In)) &&
+ "Missed TreeEntry operands?");
+
+ for (unsigned OpIdx :
+ seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
+ if (auto *I = dyn_cast<Instruction>(
+ Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
+ LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
+ << *I << "\n");
+ DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
+ }
+              // If the parent node is schedulable, it will be handled correctly.
+ if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+ break;
+ It = std::find(std::next(It),
+ Bundle->getTreeEntry()->Scalars.end(), In);
+ } while (It != Bundle->getTreeEntry()->Scalars.end());
}
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
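
The Checked set introduced above makes each (scheduling entity, operand index) pair decrement at most once, now that the do/while loop walks every lane in which In occurs. A minimal sketch of the visit-once idiom with placeholder keys:

  #include "llvm/ADT/DenseSet.h"
  #include <utility>

  void visitOnceDemo(const void *A, const void *B) {
    llvm::SmallDenseSet<std::pair<const void *, unsigned>> Checked;
    bool First = Checked.insert({A, 0u}).second; // true: first visit, do the decrement
    bool Again = Checked.insert({A, 0u}).second; // false: already handled, skip
    bool Other = Checked.insert({B, 0u}).second; // true: different entity
    (void)First; (void)Again; (void)Other;
  }
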
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index d167009..b4e4dc2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1213,6 +1213,7 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
+ Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fed04eb..8274431 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4152,6 +4152,9 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;
+ /// Represents the symbolic unroll factor of the loop.
+ VPValue UF;
+
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;
@@ -4305,6 +4308,9 @@ public:
VPValue &getVF() { return VF; };
const VPValue &getVF() const { return VF; };
+ /// Returns the symbolic UF of the vector loop region.
+ VPValue &getSymbolicUF() { return UF; };
+
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
@@ -4314,6 +4320,12 @@ public:
void addVF(ElementCount VF) { VFs.insert(VF); }
+ /// Remove \p VF from the plan.
+ void removeVF(ElementCount VF) {
+ assert(hasVF(VF) && "tried to remove VF not present in plan");
+ VFs.remove(VF);
+ }
+
void setVF(ElementCount VF) {
assert(hasVF(VF) && "Cannot set VF not already in plan");
VFs.clear();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e060e70..7bf8d83 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1478,11 +1478,8 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
if (!Plan.getVectorLoopRegion())
return false;
- if (!Plan.getTripCount()->isLiveIn())
- return false;
- auto *TC = dyn_cast_if_present<ConstantInt>(
- Plan.getTripCount()->getUnderlyingValue());
- if (!TC || !BestVF.isFixed())
+ const APInt *TC;
+ if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
return false;
// Calculate the minimum power-of-2 bit width that can fit the known TC, VF
@@ -1495,7 +1492,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
};
unsigned NewBitWidth =
- ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
+ ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
LLVMContext &Ctx = Plan.getContext();
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
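
A worked instance of the ComputeBitWidth logic above, with illustrative numbers that are not taken from the patch:

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/MathExtras.h"
  #include <algorithm>

  static unsigned narrowedIVWidth(uint64_t TripCount) {
    llvm::APInt TC(64, TripCount);
    // TripCount = 1000 -> getActiveBits() = 10 -> PowerOf2Ceil(10) = 16, so the
    // induction variable narrows to i16 (never below the 8-bit floor).
    return std::max<unsigned>(llvm::PowerOf2Ceil(TC.getActiveBits()), 8);
  }
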
@@ -2092,8 +2089,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
// Recipes in replicate regions implicitly depend on predicate. If either
// recipe is in a replicate region, only consider them equal if both have
// the same parent.
- const VPRegionBlock *RegionL = L->getParent()->getParent();
- const VPRegionBlock *RegionR = R->getParent()->getParent();
+ const VPRegionBlock *RegionL = L->getRegion();
+ const VPRegionBlock *RegionR = R->getRegion();
if (((RegionL && RegionL->isReplicator()) ||
(RegionR && RegionR->isReplicator())) &&
L->getParent() != R->getParent())
@@ -3867,8 +3864,7 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
// required lanes implicitly.
// TODO: Remove once replicate regions are unrolled completely.
auto IsCandidateUnpackUser = [Def](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return U->usesScalars(Def) &&
(!ParentRegion || !ParentRegion->isReplicator());
};
@@ -3960,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
// used.
// TODO: Assert that they aren't used.
+ VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
// If there are no users of the runtime VF, compute VFxUF by constant folding
// the multiplication of VF and UF.
if (VF.getNumUsers() == 0) {
@@ -3979,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4047,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
- unsigned VF, VPTypeAnalysis &TypeInfo,
- unsigned VectorRegWidth) {
+/// Returns a VF from \p VFs if \p InterleaveR is a full interleave group with
+/// factor and number of members both equal to that VF. The interleave group
+/// must also access the full vector width.
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+ VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+ VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
if (!InterleaveR)
- return false;
+ return std::nullopt;
Type *GroupElementTy = nullptr;
if (InterleaveR->getStoredValues().empty()) {
@@ -4063,7 +4061,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
} else {
GroupElementTy =
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4071,13 +4069,27 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
}
- unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
- auto IG = InterleaveR->getInterleaveGroup();
- return IG->getFactor() == VF && IG->getNumMembers() == VF &&
- GroupSize == VectorRegWidth;
+ auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+ TypeSize Size = TTI.getRegisterBitWidth(
+ VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+ : TargetTransformInfo::RGK_ScalableVector);
+ assert(Size.isScalable() == VF.isScalable() &&
+           "if Size is scalable, VF must be too and vice versa");
+ return Size.getKnownMinValue();
+ };
+
+ for (ElementCount VF : VFs) {
+ unsigned MinVal = VF.getKnownMinValue();
+ unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+ auto IG = InterleaveR->getInterleaveGroup();
+ if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
+ GroupSize == GetVectorWidthForVF(VF))
+ return {VF};
+ }
+ return std::nullopt;
}
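
Illustrative numbers for the width check above, assuming a target whose fixed-width vector register is 128 bits and an interleave group of float members:

  unsigned MinVal = 4;               // VF.getKnownMinValue()
  unsigned GroupSize = 32u * MinVal; // element bits * VF = 128 bits
  // GroupSize matches GetVectorWidthForVF(VF), so an interleave group with
  // factor 4 and 4 members is accepted for VF = 4.
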
/// Returns true if \p VPValue is a narrow VPValue.
@@ -4088,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+ const TargetTransformInfo &TTI) {
+ using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
if (!VectorLoop)
- return;
+ return nullptr;
VPTypeAnalysis TypeInfo(Plan);
-
- unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
+ std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
continue;
@@ -4111,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return;
+ return nullptr;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return;
-
- // Do not narrow interleave groups if there are VectorPointer recipes and
- // the plan was unrolled. The recipe implicitly uses VF from
- // VPTransformState.
- // TODO: Remove restriction once the VF for the VectorPointer offset is
- // modeled explicitly as operand.
- if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
- return;
+ return nullptr;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
- VectorRegWidth))
- return;
-
+    // Try to find a single VF where all interleave groups are consecutive and
+    // saturate the full vector width. If we already have a candidate VF, check
+    // if it is applicable for the current InterleaveR; otherwise look for a
+    // suitable VF across the Plan's VFs.
+ //
+ if (VFToOptimize) {
+ if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
+ TTI))
+ return nullptr;
+ } else {
+ if (auto VF = isConsecutiveInterleaveGroup(
+ InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
+ VFToOptimize = *VF;
+ else
+ return nullptr;
+ }
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -4168,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
- return;
+ return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return;
+ return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return;
+ return nullptr;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return;
+ return nullptr;
+
+  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+  // original Plan in two: a) a new clone that contains all of Plan's VFs except
+  // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
+ std::unique_ptr<VPlan> NewPlan;
+ if (size(Plan.vectorFactors()) != 1) {
+ NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+ Plan.setVF(*VFToOptimize);
+ NewPlan->removeVF(*VFToOptimize);
+ }
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4256,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
- VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
- if (VF.isScalable()) {
+ VPValue *UF = &Plan.getSymbolicUF();
+ if (VFToOptimize->isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
CanIV->getScalarType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4270,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
}
removeDeadRecipes(Plan);
+ assert(none_of(*VectorLoop->getEntryBasicBlock(),
+ IsaPred<VPVectorPointerRecipe>) &&
+ "All VPVectorPointerRecipes should have been removed");
+ return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b..ca8d956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -341,14 +341,20 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);
- /// Try to convert a plan with interleave groups with VF elements to a plan
- /// with the interleave groups replaced by wide loads and stores processing VF
- /// elements, if all transformed interleave groups access the full vector
- /// width (checked via \o VectorRegWidth). This effectively is a very simple
- /// form of loop-aware SLP, where we use interleave groups to identify
- /// candidates.
- static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth);
+  /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with known minimum VF elements) can be replaced by wide loads and
+  /// stores processing VF elements, provided all transformed interleave groups
+  /// access the full vector width (checked via the maximum vector register
+  /// width). If the transformation can be applied, the original \p Plan is
+  /// split in two:
+  ///  1. The original Plan with the single VF, containing the optimized recipes
+  ///     using wide loads instead of interleave groups.
+  ///  2. A new clone which contains all VFs of Plan except the optimized VF.
+ ///
+ /// This effectively is a very simple form of loop-aware SLP, where we use
+ /// interleave groups to identify candidates.
+ static std::unique_ptr<VPlan>
+ narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop