Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/ConstantFolding.cpp | 3
-rw-r--r--  llvm/lib/Analysis/Delinearization.cpp | 200
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 3
-rw-r--r--  llvm/lib/AsmParser/LLLexer.cpp | 1
-rw-r--r--  llvm/lib/AsmParser/LLParser.cpp | 2
-rw-r--r--  llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 1
-rw-r--r--  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/RegisterPressure.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 32
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 1
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 1
-rw-r--r--  llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp | 2
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp | 10
-rw-r--r--  llvm/lib/IR/ConstantFold.cpp | 1
-rw-r--r--  llvm/lib/IR/ConstantRange.cpp | 1
-rw-r--r--  llvm/lib/IR/Constants.cpp | 20
-rw-r--r--  llvm/lib/IR/DebugInfoMetadata.cpp | 10
-rw-r--r--  llvm/lib/IR/Globals.cpp | 1
-rw-r--r--  llvm/lib/IR/Instruction.cpp | 1
-rw-r--r--  llvm/lib/IR/Instructions.cpp | 52
-rw-r--r--  llvm/lib/IR/LLVMContextImpl.h | 27
-rw-r--r--  llvm/lib/IR/Value.cpp | 46
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 31
-rw-r--r--  llvm/lib/ProfileData/InstrProfWriter.cpp | 63
-rw-r--r--  llvm/lib/SandboxIR/Context.cpp | 1
-rw-r--r--  llvm/lib/SandboxIR/Instruction.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 99
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 85
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 56
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 145
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 15
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 160
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 37
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/InferAlignment.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 66
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 61
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 6
51 files changed, 898 insertions, 517 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index dd98b62..c14cb9e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
switch (Opcode) {
default:
llvm_unreachable("Missing case");
+ case Instruction::PtrToAddr:
+ // TODO: Add some of the ptrtoint folds here as well.
+ break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 329bd35..761c566 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,11 @@ using namespace llvm;
#define DL_NAME "delinearize"
#define DEBUG_TYPE DL_NAME
+static cl::opt<bool> UseFixedSizeArrayHeuristic(
+ "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden,
+ cl::desc("When printing analysis, use the heuristic for fixed-size arrays "
+ "if the default delinearizetion fails."));
+
// Return true when S contains at least one undef value.
static inline bool containsUndefs(const SCEV *S) {
return SCEVExprContains(S, [](const SCEV *S) {
@@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr,
});
}
+static std::optional<APInt> tryIntoAPInt(const SCEV *S) {
+ if (const auto *Const = dyn_cast<SCEVConstant>(S))
+ return Const->getAPInt();
+ return std::nullopt;
+}
+
+/// Collects the absolute values of constant steps for all induction variables.
+/// Returns true if we can prove that all step recurrences are constants and \p
+/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p
+/// Steps after being divided by \p ElementSize.
+static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Steps,
+ uint64_t ElementSize) {
+ // End of recursion. The constant value also must be a multiple of
+ // ElementSize.
+ if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) {
+ const uint64_t Mod = Const->getAPInt().urem(ElementSize);
+ return Mod == 0;
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AR || !AR->isAffine())
+ return false;
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ std::optional<APInt> StepAPInt = tryIntoAPInt(Step);
+ if (!StepAPInt)
+ return false;
+
+ APInt Q;
+ uint64_t R;
+ APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R);
+ if (R != 0)
+ return false;
+
+ // Bail out when the step is too large.
+ std::optional<uint64_t> StepVal = Q.tryZExtValue();
+ if (!StepVal)
+ return false;
+
+ Steps.push_back(*StepVal);
+ return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize);
+}
+
+bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<uint64_t> &Sizes,
+ const SCEV *ElementSize) {
+ if (!ElementSize)
+ return false;
+
+ std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize);
+ if (!ElementSizeAPInt || *ElementSizeAPInt == 0)
+ return false;
+
+ std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue();
+
+ // Early exit when ElementSize is not a positive constant.
+ if (!ElementSizeConst)
+ return false;
+
+ if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) ||
+ Sizes.empty()) {
+ Sizes.clear();
+ return false;
+ }
+
+ // At this point, Sizes contains the absolute step recurrences for all
+ // induction variables. Each step recurrence must be a multiple of the size of
+ // the array element. Assuming that each value represents the size of an
+ // array for each dimension, attempt to restore the length of each dimension
+ // by dividing the step recurrence by the next smaller value. For example, if
+ // we have the following AddRec SCEV:
+ //
+ // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+ //
+ // Then Sizes will become [256, 32, 1] after sorting. We don't know the size
+ // of the outermost dimension; the next dimension will be computed as 256 / 32
+ // = 8, and the last dimension will be computed as 32 / 1 = 32. Thus the
+ // result is something like Arr[UnknownSize][8][32] with elements of size 8
+ // bytes, where Arr is a base pointer.
+ //
+ // TODO: Catch more cases, e.g., when a step recurrence is not divisible by
+ // the next smaller one, like A[i][3*j].
+ llvm::sort(Sizes.rbegin(), Sizes.rend());
+ Sizes.erase(llvm::unique(Sizes), Sizes.end());
+
+ // The last element in Sizes should be ElementSize. At this point, all values
+ // in Sizes have already been divided by ElementSize, so replace it with 1.
+ assert(Sizes.back() != 0 && "Unexpected zero size in Sizes.");
+ Sizes.back() = 1;
+
+ for (unsigned I = 0; I + 1 < Sizes.size(); I++) {
+ uint64_t PrevSize = Sizes[I + 1];
+ if (Sizes[I] % PrevSize) {
+ Sizes.clear();
+ return false;
+ }
+ Sizes[I] /= PrevSize;
+ }
+
+ // Finally, set the last element back to ElementSize.
+ Sizes.back() = *ElementSizeConst;
+ return true;
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access, assuming that the array is a fixed size array.
+///
+/// E.g., if we have code like the following:
+///
+/// double A[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+///
+/// The access function will be represented as an AddRec SCEV like:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// Then findFixedSizeArrayDimensions infers the size of each dimension of the
+/// array based on the fact that the value of the step recurrence is a multiple
+/// of the size of the corresponding array element. In the above example, it
+/// results in the following:
+///
+/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes.
+///
+/// Finally each subscript will be computed as follows:
+///
+/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// Note that this function doesn't check the range of possible values for each
+/// subscript, so the caller should perform additional boundary checks if
+/// necessary.
+///
+/// Also note that this function doesn't guarantee that the original array size
+/// is restored "correctly". For example, in the following case:
+///
+/// double A[42][4][64];
+/// double B[42][8][32];
+/// for i
+/// for j
+/// for k
+/// use A[i][j][k]
+/// use B[i][2*j][k]
+///
+/// The access function for both accesses will be the same:
+///
+/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8)
+///
+/// The array sizes for both A and B will be computed as
+/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B.
+///
+/// TODO: At the moment, this function can handle only simple cases. For
+/// example, we cannot handle a case where a step recurrence is not divisible
+/// by the next smaller step recurrence, e.g., A[i][3*j].
+bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize) {
+
+ // First step: find the fixed array size.
+ SmallVector<uint64_t, 4> ConstSizes;
+ if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) {
+ Sizes.clear();
+ return false;
+ }
+
+ // Convert the constant size to SCEV.
+ for (uint64_t Size : ConstSizes)
+ Sizes.push_back(SE.getConstant(Expr->getType(), Size));
+
+ // Second step: compute the access functions for each subscript.
+ computeAccessFunctions(SE, Expr, Subscripts, Sizes);
+
+ return !Subscripts.empty();
+}
+
bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
const GetElementPtrInst *GEP,
SmallVectorImpl<const SCEV *> &Subscripts,
@@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI,
O << "AccessFunction: " << *AccessFn << "\n";
SmallVector<const SCEV *, 3> Subscripts, Sizes;
+
+ auto IsDelinearizationFailed = [&]() {
+ return Subscripts.size() == 0 || Sizes.size() == 0 ||
+ Subscripts.size() != Sizes.size();
+ };
+
delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst));
- if (Subscripts.size() == 0 || Sizes.size() == 0 ||
- Subscripts.size() != Sizes.size()) {
+ if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) {
+ Subscripts.clear();
+ Sizes.clear();
+ delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes,
+ SE->getElementSize(&Inst));
+ }
+
+ if (IsDelinearizationFailed()) {
O << "failed to delinearize\n";
continue;
}
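
The arithmetic described in the comments above can be checked in isolation. Below is a minimal standalone sketch (plain C++, not the LLVM API; recoverSizes is a hypothetical helper) of the divide/sort/unique/ratio sequence, using the step values {2048, 256, 8} and element size 8 from the AddRec example:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Recover per-dimension lengths from the absolute AddRec steps, mirroring
// the sequence in findFixedSizeArrayDimensions.
static bool recoverSizes(std::vector<uint64_t> Steps, uint64_t ElementSize,
                         std::vector<uint64_t> &Sizes) {
  for (uint64_t &S : Steps) {
    if (S == 0 || S % ElementSize != 0)
      return false; // every step must be a non-zero multiple of ElementSize
    S /= ElementSize;
  }
  std::sort(Steps.rbegin(), Steps.rend()); // descending
  Steps.erase(std::unique(Steps.begin(), Steps.end()), Steps.end());
  for (size_t I = 0; I + 1 < Steps.size(); ++I) {
    if (Steps[I] % Steps[I + 1] != 0)
      return false; // e.g. A[i][3*j] is rejected, as the TODO notes
    Sizes.push_back(Steps[I] / Steps[I + 1]);
  }
  return true;
}

int main() {
  std::vector<uint64_t> Sizes;
  bool OK = recoverSizes({2048, 256, 8}, /*ElementSize=*/8, Sizes);
  assert(OK && Sizes == std::vector<uint64_t>({8, 32})); // A[Unknown][8][32]
  (void)OK;
}

Note that with this patch the heuristic is only exercised from the analysis printer, and only when the new -delinearize-use-fixed-size-array-heuristic flag is set.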
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1e70228..b0e4b00 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9147,7 +9147,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst,
return false;
for (unsigned I = 0; I != 2; ++I) {
- if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) {
+ if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I));
+ Operation && Operation->getNumOperands() >= 2) {
Value *LHS = Operation->getOperand(0);
Value *RHS = Operation->getOperand(1);
if (LHS != PN && RHS != PN)
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 520c6a0..3d5bd61 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(fptoui, FPToUI);
INSTKEYWORD(fptosi, FPToSI);
INSTKEYWORD(inttoptr, IntToPtr);
+ INSTKEYWORD(ptrtoaddr, PtrToAddr);
INSTKEYWORD(ptrtoint, PtrToInt);
INSTKEYWORD(bitcast, BitCast);
INSTKEYWORD(addrspacecast, AddrSpaceCast);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 13bef1f..1bc2906 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
case lltok::kw_bitcast:
case lltok::kw_addrspacecast:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint: {
unsigned Opc = Lex.getUIntVal();
Type *DestTy = nullptr;
@@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_fptoui:
case lltok::kw_fptosi:
case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoaddr:
case lltok::kw_ptrtoint:
return parseCast(Inst, PFS, KeywordVal);
case lltok::kw_fptrunc:
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 290d873..22a0d0f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) {
case bitc::CAST_SITOFP : return Instruction::SIToFP;
case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
case bitc::CAST_FPEXT : return Instruction::FPExt;
+ case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr;
case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
case bitc::CAST_BITCAST : return Instruction::BitCast;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 05680fa..a3f8254 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) {
case Instruction::SIToFP : return bitc::CAST_SITOFP;
case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
case Instruction::FPExt : return bitc::CAST_FPEXT;
+ case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR;
case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
case Instruction::BitCast : return bitc::CAST_BITCAST;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index c72b6e8..23a3543 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV,
break; // Error
}
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt: {
const DataLayout &DL = getDataLayout();
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index ca51b67..5f37890 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
- int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc();
if (PDiff > 0) {
Delta.CriticalMax = PressureChange(i);
Delta.CriticalMax.setUnitInc(PDiff);
@@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
++CritIdx;
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
- int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc();
if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
Delta.CriticalMax = PressureChange(PSetID);
Delta.CriticalMax.setUnitInc(CritInc);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5f1e38a..17703f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16342,6 +16342,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
DAG, DL);
}
break;
+ case ISD::ABDU:
+ case ISD::ABDS:
+ // (trunc (abdu/abds a, b)) → (abdu/abds (trunc a), (trunc b))
+ if (!LegalOperations || N0.hasOneUse()) {
+ EVT SrcVT = N0.getValueType();
+ EVT TruncVT = VT;
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned TruncBits = TruncVT.getScalarSizeInBits();
+ unsigned NeededBits = SrcBits - TruncBits;
+
+ SDValue A = N0.getOperand(0);
+ SDValue B = N0.getOperand(1);
+ bool CanFold = false;
+
+ if (N0.getOpcode() == ISD::ABDU) {
+ KnownBits KnownA = DAG.computeKnownBits(A);
+ KnownBits KnownB = DAG.computeKnownBits(B);
+ CanFold = KnownA.countMinLeadingZeros() >= NeededBits &&
+ KnownB.countMinLeadingZeros() >= NeededBits;
+ } else {
+ unsigned SignBitsA = DAG.ComputeNumSignBits(A);
+ unsigned SignBitsB = DAG.ComputeNumSignBits(B);
+ CanFold = SignBitsA > NeededBits && SignBitsB > NeededBits;
+ }
+
+ if (CanFold && TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A);
+ SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B);
+ return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB);
+ }
+ }
+ break;
}
return SDValue();
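
The unsigned case of the new fold is exact because if both inputs are known to fit in the low TruncBits bits, their absolute difference fits there as well. A small sketch of that precondition using llvm::KnownBits, the same query the combine performs (abduTruncIsSafe is a hypothetical helper; the 32-to-8-bit widths are illustrative):

#include "llvm/Support/KnownBits.h"
using namespace llvm;

// If both inputs fit in the low TruncBits bits, |a - b| does too, so
// computing ABDU at the narrow width loses nothing.
static bool abduTruncIsSafe(const KnownBits &A, const KnownBits &B,
                            unsigned SrcBits, unsigned TruncBits) {
  unsigned NeededBits = SrcBits - TruncBits;
  return A.countMinLeadingZeros() >= NeededBits &&
         B.countMinLeadingZeros() >= NeededBits;
}

int main() {
  KnownBits A(32), B(32);
  A.Zero.setHighBits(24); // a is known to fit in 8 bits
  B.Zero.setHighBits(24); // b is known to fit in 8 bits
  return abduTruncIsSafe(A, B, /*SrcBits=*/32, /*TruncBits=*/8) ? 0 : 1;
}

The signed case uses ComputeNumSignBits instead, requiring strictly more than NeededBits sign bits so the value survives sign extension from the narrow type.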
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0d1e954..48ab797 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3977,6 +3977,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
}
+void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
+ // FIXME: this is not correct for pointers with addr width != pointer width
+ visitPtrToInt(I);
+}
+
void SelectionDAGBuilder::visitPtrToInt(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c251755..e0835e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -574,6 +574,7 @@ private:
void visitFPToSI(const User &I);
void visitUIToFP(const User &I);
void visitSIToFP(const User &I);
+ void visitPtrToAddr(const User &I);
void visitPtrToInt(const User &I);
void visitIntToPtr(const User &I);
void visitBitCast(const User &I);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bf4c9f9..d80a229 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
case SIToFP: return ISD::SINT_TO_FP;
case FPTrunc: return ISD::FP_ROUND;
case FPExt: return ISD::FP_EXTEND;
+ case PtrToAddr: return ISD::BITCAST;
case PtrToInt: return ISD::BITCAST;
case IntToPtr: return ISD::BITCAST;
case BitCast: return ISD::BITCAST;
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 9d84aa8..72308a3d 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) {
// This range is reserved, therefore invalid, according to the spec
// https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal
bool verifyRegisterSpace(uint32_t RegisterSpace) {
- return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF);
+ return !(RegisterSpace >= 0xFFFFFFF0);
}
bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 35f00ae..b91fd70 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1314,12 +1314,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
if ((Name.starts_with("lifetime.start") ||
Name.starts_with("lifetime.end")) &&
F->arg_size() == 2) {
+ Intrinsic::ID IID = Name.starts_with("lifetime.start")
+ ? Intrinsic::lifetime_start
+ : Intrinsic::lifetime_end;
rename(F);
- NewFn = Intrinsic::getOrInsertDeclaration(
- F->getParent(),
- Name.starts_with("lifetime.start") ? Intrinsic::lifetime_start
- : Intrinsic::lifetime_end,
- F->getArg(0)->getType());
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID,
+ F->getArg(0)->getType());
return true;
}
break;
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d4ad21e..6b202ba 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return FoldBitCast(V, DestTy);
case Instruction::AddrSpaceCast:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return nullptr;
}
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index e09c139..2fcdbcc6 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::AddrSpaceCast:
// Conservatively return getFull set.
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a3c725b..c7e3113a 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty,
llvm_unreachable("Invalid cast opcode");
case Instruction::Trunc:
return getTrunc(C, Ty, OnlyIfReduced);
+ case Instruction::PtrToAddr:
+ return getPtrToAddr(C, Ty, OnlyIfReduced);
case Instruction::PtrToInt:
return getPtrToInt(C, Ty, OnlyIfReduced);
case Instruction::IntToPtr:
@@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced);
}
+Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy,
+ bool OnlyIfReduced) {
+ assert(C->getType()->isPtrOrPtrVectorTy() &&
+ "PtrToAddr source must be pointer or pointer vector");
+ assert(DstTy->isIntOrIntVectorTy() &&
+ "PtrToAddr destination must be integer or integer vector");
+ assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
+ if (isa<VectorType>(C->getType()))
+ assert(cast<VectorType>(C->getType())->getElementCount() ==
+ cast<VectorType>(DstTy)->getElementCount() &&
+ "Invalid cast between a different number of vector elements");
+ return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced);
+}
+
Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
bool OnlyIfReduced) {
assert(C->getType()->isPtrOrPtrVectorTy() &&
@@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) {
case Instruction::FPToSI:
return false;
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
@@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const {
switch (getOpcode()) {
case Instruction::Trunc:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f1d4549..96065ed 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR)
DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line,
unsigned Column, uint64_t AtomGroup, uint8_t AtomRank,
ArrayRef<Metadata *> MDs, bool ImplicitCode)
- : MDNode(C, DILocationKind, Storage, MDs)
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- ,
- AtomGroup(AtomGroup), AtomRank(AtomRank)
-#endif
-{
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
+ : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup),
+ AtomRank(AtomRank) {
assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits");
-#endif
if (AtomGroup)
C.updateDILocationAtomGroupWaterline(AtomGroup + 1);
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 7b799c7..11d33e2 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases,
return findBaseObject(CE->getOperand(0), Aliases, Op);
}
case Instruction::IntToPtr:
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::GetElementPtr:
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index b7cd12a..4540268 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -817,6 +817,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case UIToFP: return "uitofp";
case SIToFP: return "sitofp";
case IntToPtr: return "inttoptr";
+ case PtrToAddr: return "ptrtoaddr";
case PtrToInt: return "ptrtoint";
case BitCast: return "bitcast";
case AddrSpaceCast: return "addrspacecast";
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b896382..a1751c0 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
return false;
case Instruction::BitCast:
return true; // BitCast never modifies bits.
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() ==
DestTy->getScalarSizeInBits();
@@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair(
// same reason.
const unsigned numCastOps =
Instruction::CastOpsEnd - Instruction::CastOpsBegin;
+ // clang-format off
static const uint8_t CastResults[numCastOps][numCastOps] = {
- // T F F U S F F P I B A -+
- // R Z S P P I I T P 2 N T S |
- // U E E 2 2 2 2 R E I T C C +- secondOp
- // N X X U S F F N X N 2 V V |
- // C T T I I P P C T T P T T -+
- { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+
- { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt |
- { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc |
- { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt |
- { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt |
- { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr |
- { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast |
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
+ // T F F U S F F P P I B A -+
+ // R Z S P P I I T P 2 2 N T S |
+ // U E E 2 2 2 2 R E I A T C C +- secondOp
+ // N X X U S F F N X N D 2 V V |
+ // C T T I I P P C T T R P T T -+
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+
+ { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt |
+ { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI |
+ { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc |
+ { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
+ { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr |
+ { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast |
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
};
+ // clang-format on
// TODO: This logic could be encoded into the table above and handled in the
// switch below.
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
+ case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore);
case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
case BitCast:
@@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) {
case Instruction::FPToSI:
return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
SrcEC == DstEC;
+ case Instruction::PtrToAddr:
case Instruction::PtrToInt:
if (SrcEC != DstEC)
return false;
@@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name,
assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
}
+PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name,
+ InsertPosition InsertBefore)
+ : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr");
+}
+
IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name,
InsertPosition InsertBefore)
: CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
@@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const {
return new PtrToIntInst(getOperand(0), getType());
}
+PtrToAddrInst *PtrToAddrInst::cloneImpl() const {
+ return new PtrToAddrInst(getOperand(0), getType());
+}
+
IntToPtrInst *IntToPtrInst::cloneImpl() const {
return new IntToPtrInst(getOperand(0), getType());
}
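
With the factory case added above, the new cast can be built like any other. A minimal sketch (emitPtrToAddr is a hypothetical helper; the i64 destination is an assumption standing in for DL.getAddressType(SrcTy), which the verifier change in this patch requires the result type to match):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Emit `%addr = ptrtoaddr ptr %Ptr to i64` at the end of BB.
static Instruction *emitPtrToAddr(Value *Ptr, BasicBlock *BB) {
  Type *AddrTy = Type::getInt64Ty(BB->getContext()); // assumed address type
  return CastInst::Create(Instruction::PtrToAddr, Ptr, AddrTy, "addr", BB);
}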
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index aa2a60e..e03f993 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
template <> struct MDNodeKeyImpl<DILocation> {
Metadata *Scope;
Metadata *InlinedAt;
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
uint64_t AtomGroup : 61;
uint64_t AtomRank : 3;
-#endif
unsigned Line;
uint16_t Column;
bool ImplicitCode;
@@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> {
MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope,
Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup,
uint8_t AtomRank)
- : Scope(Scope), InlinedAt(InlinedAt),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- AtomGroup(AtomGroup), AtomRank(AtomRank),
-#endif
- Line(Line), Column(Column), ImplicitCode(ImplicitCode) {
- }
+ : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup),
+ AtomRank(AtomRank), Line(Line), Column(Column),
+ ImplicitCode(ImplicitCode) {}
MDNodeKeyImpl(const DILocation *L)
: Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()),
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()),
-#endif
Line(L->getLine()), Column(L->getColumn()),
- ImplicitCode(L->isImplicitCode()) {
- }
+ ImplicitCode(L->isImplicitCode()) {}
bool isKeyOf(const DILocation *RHS) const {
return Line == RHS->getLine() && Column == RHS->getColumn() &&
Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() &&
- ImplicitCode == RHS->isImplicitCode()
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- && AtomGroup == RHS->getAtomGroup() &&
- AtomRank == RHS->getAtomRank();
-#else
- ;
-#endif
+ ImplicitCode == RHS->isImplicitCode() &&
+ AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank();
}
unsigned getHashValue() const {
-#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
// Hashing AtomGroup and AtomRank substantially impacts performance whether
// Key Instructions is enabled or not. We can't detect whether it's enabled
// here cheaply; avoiding hashing zero values is a good approximation. This
@@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> {
if (AtomGroup || AtomRank)
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode,
AtomGroup, (uint8_t)AtomRank);
-#endif
return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode);
}
};
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 129ca4a..5928c89 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets(
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
- unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType());
- if (CurBitWidth == BitWidth) {
- if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis))
- return V;
- } else {
- APInt GEPOffset(CurBitWidth, 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
- return V;
+ APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
+ return V;
- // Stop traversal if the pointer offset wouldn't fit in the bit-width
- // provided by the Offset argument. This can happen due to AddrSpaceCast
- // stripping.
- if (GEPOffset.getSignificantBits() > BitWidth)
- return V;
+ // Stop traversal if the pointer offset wouldn't fit in the bit-width
+ // provided by the Offset argument. This can happen due to AddrSpaceCast
+ // stripping.
+ if (GEPOffset.getSignificantBits() > BitWidth)
+ return V;
- // External Analysis can return a result higher/lower than the value
- // represents. We need to detect overflow/underflow.
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
- if (!ExternalAnalysis) {
- Offset += GEPOffsetST;
- } else {
- bool Overflow = false;
- APInt OldOffset = Offset;
- Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
- if (Overflow) {
- Offset = OldOffset;
- return V;
- }
+ // External Analysis can return a result higher/lower than the value
+ // represents. We need to detect overflow/underflow.
+ APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
+ if (!ExternalAnalysis) {
+ Offset += GEPOffsetST;
+ } else {
+ bool Overflow = false;
+ APInt OldOffset = Offset;
+ Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
+ if (Overflow) {
+ Offset = OldOffset;
+ return V;
}
}
V = GEP->getPointerOperand();
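
The rewrite above always accumulates in the GEP's index width and then folds the result into the caller-provided width in one place. A standalone sketch of that folding step with llvm::APInt (foldGEPOffset is a hypothetical helper; the widths are whatever the caller chose):

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Fold a GEP offset (in the GEP's index width) into the caller's
// accumulator, bailing out when it would not fit or would overflow.
static bool foldGEPOffset(APInt &Offset, const APInt &GEPOffset) {
  unsigned BitWidth = Offset.getBitWidth();
  if (GEPOffset.getSignificantBits() > BitWidth)
    return false; // too wide for the caller's accumulator
  bool Overflow = false;
  APInt NewOffset = Offset.sadd_ov(GEPOffset.sextOrTrunc(BitWidth), Overflow);
  if (Overflow)
    return false;
  Offset = NewOffset;
  return true;
}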
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f5dcb5e..1d3c379 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -566,6 +566,8 @@ private:
void visitUIToFPInst(UIToFPInst &I);
void visitSIToFPInst(SIToFPInst &I);
void visitIntToPtrInst(IntToPtrInst &I);
+ void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V);
+ void visitPtrToAddrInst(PtrToAddrInst &I);
void visitPtrToIntInst(PtrToIntInst &I);
void visitBitCastInst(BitCastInst &I);
void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
&GV);
Check(GV.getInitializer()->getType()->isSized(),
"Global variable initializer must be sized", &GV);
+ visitConstantExprsRecursively(GV.getInitializer());
// If the global has common linkage, it must have a zero initializer and
// cannot be constant.
if (GV.hasCommonLinkage()) {
@@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
CE->getType()),
"Invalid bitcast", CE);
+ else if (CE->getOpcode() == Instruction::PtrToAddr)
+ checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE);
}
void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) {
@@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) {
visitInstruction(I);
}
+void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) {
+ Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V);
+ Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V);
+ Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch",
+ V);
+
+ if (SrcTy->isVectorTy()) {
+ auto *VSrc = cast<VectorType>(SrcTy);
+ auto *VDest = cast<VectorType>(DestTy);
+ Check(VSrc->getElementCount() == VDest->getElementCount(),
+ "PtrToAddr vector length mismatch", V);
+ }
+
+ Type *AddrTy = DL.getAddressType(SrcTy);
+ Check(AddrTy == DestTy, "PtrToAddr result must be address width", V);
+}
+
+void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) {
+ checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I);
+ visitInstruction(I);
+}
+
void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
// Get the source and destination types
Type *SrcTy = I.getOperand(0)->getType();
@@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "PtrToInt Vector width mismatch", &I);
+ "PtrToInt Vector length mismatch", &I);
}
visitInstruction(I);
@@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
auto *VSrc = cast<VectorType>(SrcTy);
auto *VDest = cast<VectorType>(DestTy);
Check(VSrc->getElementCount() == VDest->getElementCount(),
- "IntToPtr Vector width mismatch", &I);
+ "IntToPtr Vector length mismatch", &I);
}
visitInstruction(I);
}
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7ca26aa..df807fc 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -331,61 +331,34 @@ void InstrProfWriter::addDataAccessProfData(
DataAccessProfileData = std::move(DataAccessProfDataIn);
}
-void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
- assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength);
- assert(!Trace.FunctionNameRefs.empty());
- if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) {
- // Simply append the trace if we have not yet hit our reservoir size limit.
- TemporalProfTraces.push_back(std::move(Trace));
- } else {
- // Otherwise, replace a random trace in the stream.
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
- uint64_t RandomIndex = Distribution(RNG);
- if (RandomIndex < TemporalProfTraces.size())
- TemporalProfTraces[RandomIndex] = std::move(Trace);
- }
- ++TemporalProfTraceStreamSize;
-}
-
void InstrProfWriter::addTemporalProfileTraces(
SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) {
+ if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize)
+ TemporalProfTraces.truncate(TemporalProfTraceReservoirSize);
for (auto &Trace : SrcTraces)
if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); });
- // Assume that the source has the same reservoir size as the destination to
- // avoid needing to record it in the indexed profile format.
- bool IsDestSampled =
- (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize);
- bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize);
- if (!IsDestSampled && IsSrcSampled) {
- // If one of the traces are sampled, ensure that it belongs to Dest.
- std::swap(TemporalProfTraces, SrcTraces);
- std::swap(TemporalProfTraceStreamSize, SrcStreamSize);
- std::swap(IsDestSampled, IsSrcSampled);
- }
- if (!IsSrcSampled) {
- // If the source stream is not sampled, we add each source trace normally.
- for (auto &Trace : SrcTraces)
- addTemporalProfileTrace(std::move(Trace));
+ // If there are no source traces, it is probably because
+ // --temporal-profile-max-trace-length=0 was set to deliberately remove all
+ // traces. In that case, we do not want to increase the stream size.
+ if (SrcTraces.empty())
return;
- }
- // Otherwise, we find the traces that would have been removed if we added
- // the whole source stream.
- SmallSetVector<uint64_t, 8> IndicesToReplace;
- for (uint64_t I = 0; I < SrcStreamSize; I++) {
- std::uniform_int_distribution<uint64_t> Distribution(
- 0, TemporalProfTraceStreamSize);
+ // Add traces until our reservoir is full or we run out of source traces
+ auto SrcTraceIt = SrcTraces.begin();
+ while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize &&
+ SrcTraceIt < SrcTraces.end())
+ TemporalProfTraces.push_back(*SrcTraceIt++);
+ // Our reservoir is full, so we need to sample the source stream.
+ llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG);
+ for (uint64_t I = TemporalProfTraces.size();
+ I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) {
+ std::uniform_int_distribution<uint64_t> Distribution(0, I);
uint64_t RandomIndex = Distribution(RNG);
if (RandomIndex < TemporalProfTraces.size())
- IndicesToReplace.insert(RandomIndex);
- ++TemporalProfTraceStreamSize;
+ TemporalProfTraces[RandomIndex] = *SrcTraceIt++;
}
- // Then we insert a random sample of the source traces.
- llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG);
- for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces))
- TemporalProfTraces[Index] = std::move(Trace);
+ TemporalProfTraceStreamSize += SrcStreamSize;
}
void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW,
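
The control flow of the rewritten addTemporalProfileTraces, sketched with std containers standing in for the LLVM types (mergeIntoReservoir is a hypothetical helper): fill the reservoir from the source first, then sample the shuffled remainder while the stream index keeps growing.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

template <typename T>
void mergeIntoReservoir(std::vector<T> &Reservoir, std::vector<T> Src,
                        uint64_t SrcStreamSize, uint64_t &StreamSize,
                        size_t Capacity, std::mt19937 &RNG) {
  if (Src.empty())
    return; // do not inflate the stream size for deliberately emptied input
  auto It = Src.begin();
  // Phase 1: top up the reservoir while there is room.
  while (Reservoir.size() < Capacity && It != Src.end())
    Reservoir.push_back(*It++);
  // Phase 2: reservoir-sample the rest with a growing stream index.
  std::shuffle(It, Src.end(), RNG);
  for (uint64_t I = Reservoir.size(); I < SrcStreamSize && It != Src.end();
       ++I) {
    std::uniform_int_distribution<uint64_t> Dist(0, I);
    uint64_t Idx = Dist(RNG);
    if (Idx < Reservoir.size())
      Reservoir[Idx] = *It++;
  }
  StreamSize += SrcStreamSize;
}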
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index fe34037..70ac68a 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
case llvm::Instruction::FPToUI:
case llvm::Instruction::FPToSI:
case llvm::Instruction::FPExt:
+ case llvm::Instruction::PtrToAddr:
case llvm::Instruction::PtrToInt:
case llvm::Instruction::IntToPtr:
case llvm::Instruction::SIToFP:
diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp
index 956047c..1a81d18 100644
--- a/llvm/lib/SandboxIR/Instruction.cpp
+++ b/llvm/lib/SandboxIR/Instruction.cpp
@@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) {
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI);
case Instruction::Opcode::FPExt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt);
+ case Instruction::Opcode::PtrToAddr:
+ return static_cast<llvm::Instruction::CastOps>(
+ llvm::Instruction::PtrToAddr);
case Instruction::Opcode::PtrToInt:
return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt);
case Instruction::Opcode::IntToPtr:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f05add..5c94aeb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) {
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
-static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
+static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
+ const IntrinsicCostAttributes &ICA) {
+ // We need to know at least the number of elements in the vector of buckets
+ // and the size of each element to update.
+ if (ICA.getArgTypes().size() < 2)
+ return InstructionCost::getInvalid();
+
+ // Only interested in costing for the hardware instruction from SVE2.
+ if (!ST->hasSVE2())
+ return InstructionCost::getInvalid();
+
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
unsigned TotalHistCnts = 1;
@@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
TotalHistCnts = EC / NaturalVectorWidth;
+
+ return InstructionCost(BaseHistCntCost * TotalHistCnts);
}
- return InstructionCost(BaseHistCntCost * TotalHistCnts);
+ return InstructionCost::getInvalid();
}
InstructionCost
@@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return InstructionCost::getInvalid();
switch (ICA.getID()) {
- case Intrinsic::experimental_vector_histogram_add:
- if (!ST->hasSVE2())
- return InstructionCost::getInvalid();
- return getHistogramCost(ICA);
+ case Intrinsic::experimental_vector_histogram_add: {
+ InstructionCost HistCost = getHistogramCost(ST, ICA);
+ // If the cost isn't valid, we may still be able to scalarize
+ if (HistCost.isValid())
+ return HistCost;
+ break;
+ }
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
@@ -3975,6 +3990,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
}
+std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
+ Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const {
+ if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
+ return std::nullopt;
+ if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
+ return std::nullopt;
+
+ Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
+ InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
+ TTI::CastContextHint::None, CostKind);
+ if (!Op1Info.isConstant() && !Op2Info.isConstant())
+ Cost *= 2;
+ Cost += InstCost(PromotedTy);
+ if (IncludeTrunc)
+ Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+}
+
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
@@ -3997,6 +4033,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ // Increase the cost for half and bfloat types if not architecturally
+ // supported.
+ if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
+ ISD == ISD::FDIV || ISD == ISD::FREM)
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+ [&](Type *PromotedTy) {
+ return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
+ Op1Info, Op2Info);
+ }))
+ return *PromotedCost;
+
switch (ISD) {
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
@@ -4265,11 +4313,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
[[fallthrough]];
case ISD::FADD:
case ISD::FSUB:
- // Increase the cost for half and bfloat types if not architecturally
- // supported.
- if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
- (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
- return 2 * LT.first;
if (!Ty->getScalarType()->isFP128Ty())
return LT.first;
[[fallthrough]];
@@ -4371,25 +4414,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
}
if (Opcode == Instruction::FCmp) {
- // Without dedicated instructions we promote f16 + bf16 compares to f32.
- if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) ||
- ValTy->getScalarType()->isBFloatTy()) {
- Type *PromotedTy =
- ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext()));
- InstructionCost Cost =
- getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
- TTI::CastContextHint::None, CostKind);
- if (!Op1Info.isConstant() && !Op2Info.isConstant())
- Cost *= 2;
- Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
- Op1Info, Op2Info);
- if (ValTy->isVectorTy())
- Cost += getCastInstrCost(
- Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)),
- VectorType::getInteger(cast<VectorType>(PromotedTy)),
- TTI::CastContextHint::None, CostKind);
- return Cost;
- }
+ if (auto PromotedCost = getFP16BF16PromoteCost(
+ ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
+ [&](Type *PromotedTy) {
+ InstructionCost Cost =
+ getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
+ CostKind, Op1Info, Op2Info);
+ if (isa<VectorType>(PromotedTy))
+ Cost += getCastInstrCost(
+ Instruction::Trunc,
+ VectorType::getInteger(cast<VectorType>(ValTy)),
+ VectorType::getInteger(cast<VectorType>(PromotedTy)),
+ TTI::CastContextHint::None, CostKind);
+ return Cost;
+ }))
+ return *PromotedCost;
auto LT = getTypeLegalizationCost(ValTy);
// Model unknown fp compares as a libcall.
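
The helper declared in the header diff that follows composes three known costs. A sketch of just that arithmetic (fp16PromoteCost is a hypothetical stand-in; the unit costs are illustrative, not real AArch64 numbers):

#include <optional>

// Promoted cost = fpext(s) + op at f32 [+ fptrunc back to the source type].
// The extension cost is doubled only when neither operand is constant.
static std::optional<int> fp16PromoteCost(bool NeedsPromotion,
                                          bool BothOperandsVariable,
                                          bool IncludeTrunc, int OpCostAtF32,
                                          int FPExtCost = 1,
                                          int FPTruncCost = 1) {
  if (!NeedsPromotion) // native FP16/BF16 support: use the normal tables
    return std::nullopt;
  int Cost = FPExtCost * (BothOperandsVariable ? 2 : 1);
  Cost += OpCostAtF32;
  if (IncludeTrunc)
    Cost += FPTruncCost;
  return Cost;
}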
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 7f45177..fa9b25a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -435,6 +435,14 @@ public:
bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); }
+ /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the
+ /// architecture features are not present.
+ std::optional<InstructionCost>
+ getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo Op1Info,
+ TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+ std::function<InstructionCost(Type *)> InstCost) const;
+
InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5..626734a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const Function &F = MF.getFunction();
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
- // dispatch registers are function args.
- unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
- if (isShader(F.getCallingConv())) {
- bool IsPixelShader =
- F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
- // Calculate the number of VGPR registers based on the SPI input registers
- uint32_t InputEna = 0;
- uint32_t InputAddr = 0;
- unsigned LastEna = 0;
-
- if (IsPixelShader) {
- // Note for IsPixelShader:
- // By this stage, all enabled inputs are tagged in InputAddr as well.
- // We will use InputAddr to determine whether the input counts against the
- // vgpr total and only use the InputEnable to determine the last input
- // that is relevant - if extra arguments are used, then we have to honour
- // the InputAddr for any intermediate non-enabled inputs.
- InputEna = MFI->getPSInputEnable();
- InputAddr = MFI->getPSInputAddr();
-
- // We only need to consider input args up to the last used arg.
- assert((InputEna || InputAddr) &&
- "PSInputAddr and PSInputEnable should "
- "never both be 0 for AMDGPU_PS shaders");
- // There are some rare circumstances where InputAddr is non-zero and
- // InputEna can be set to 0. In this case we default to setting LastEna
- // to 1.
- LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
- }
+ // dispatch registers are passed as function args.
+ unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+ WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
- // FIXME: We should be using the number of registers determined during
- // calling convention lowering to legalize the types.
- const DataLayout &DL = F.getDataLayout();
- unsigned PSArgCount = 0;
- unsigned IntermediateVGPR = 0;
- for (auto &Arg : F.args()) {
- unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg)) {
- WaveDispatchNumSGPR += NumRegs;
- } else {
- // If this is a PS shader and we're processing the PS Input args (first
- // 16 VGPR), use the InputEna and InputAddr bits to define how many
- // VGPRs are actually used.
- // Any extra VGPR arguments are handled as normal arguments (and
- // contribute to the VGPR count whether they're used or not).
- if (IsPixelShader && PSArgCount < 16) {
- if ((1 << PSArgCount) & InputAddr) {
- if (PSArgCount < LastEna)
- WaveDispatchNumVGPR += NumRegs;
- else
- IntermediateVGPR += NumRegs;
- }
- PSArgCount++;
- } else {
- // If there are extra arguments we have to include the allocation for
- // the non-used (but enabled with InputAddr) input arguments
- if (IntermediateVGPR) {
- WaveDispatchNumVGPR += IntermediateVGPR;
- IntermediateVGPR = 0;
- }
- WaveDispatchNumVGPR += NumRegs;
- }
- }
- }
+ if (WaveDispatchNumSGPR) {
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+ {ProgInfo.NumSGPR,
+ MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+ Ctx)},
+ Ctx);
+ }
+ if (WaveDispatchNumVGPR) {
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- } else if (isKernel(F.getCallingConv()) &&
- MFI->getNumKernargPreloadedSGPRs()) {
- // Consider cases where the total number of UserSGPRs with trailing
- // allocated preload SGPRs, is greater than the number of explicitly
- // referenced SGPRs.
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
- CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
- ProgInfo.NumSGPR =
- AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274..64a9bde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
++i;
}
+ if (Info->getNumKernargPreloadedSGPRs())
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ if (IsEntryFunc) {
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ }
+
FormalArgHandler Handler(B, MRI);
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3..ef63acc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget
-GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
+ setTarget(NumSGPRs, NumVGPRs);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
- MaxUnifiedVGPRs =
- ST.hasGFX90AInsts()
- ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
- : 0;
+ if (UnifiedRF) {
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxUnifiedVGPRs =
+ std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
+ } else {
+ MaxUnifiedVGPRs = 0;
+ }
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
@@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- return isVGPRBankSaveBeneficial(NumVGPRs);
+ // The addressable limit must always be respected.
+ if (NumVGPRs > MaxVGPRs)
+ return true;
+ // For unified RFs, the combined VGPR usage limit must be respected as well.
+ return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
}
bool GCNRPTarget::satisfied() const {
- if (RP.getSGPRNum() > MaxSGPRs)
+ if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
- (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
+ if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
- return satisfiesUnifiedTarget();
+ return true;
}
///////////////////////////////////////////////////////////////////////////////
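Restated as scalar checks, the rewritten satisfied()/isSaveBeneficial() pair reduces to three limit comparisons; a sketch assuming getVGPRNum(true) is the combined ArchVGPR+AGPR usage on unified register files:

// Illustrative predicate only; the real class queries a GCNRegPressure.
bool rpTargetSatisfied(unsigned SGPRs, unsigned VGPRs, unsigned UnifiedVGPRs,
                       unsigned MaxSGPRs, unsigned MaxVGPRs,
                       unsigned MaxUnifiedVGPRs, bool UnifiedRF) {
  if (SGPRs > MaxSGPRs || VGPRs > MaxVGPRs)
    return false; // addressable limits are always binding
  // On unified register files the combined budget must hold as well.
  return !UnifiedRF || UnifiedVGPRs <= MaxUnifiedVGPRs;
}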
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a22..a9c58bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -186,20 +186,22 @@ public:
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
/// minimum target occupancy).
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings = false);
+ GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
+
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);
const GCNRegPressure &getCurrentRP() const { return RP; }
@@ -207,7 +209,7 @@ public:
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
@@ -227,15 +229,15 @@ public:
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
- } else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif
private:
+ const MachineFunction &MF;
+ const bool UnifiedRF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -246,29 +248,10 @@ private:
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
- /// Whether we consider that the register allocator will be able to swap
- /// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
- /// savings in the other VGPR bank.
- bool CombineVGPRSavings;
-
- inline bool satisifiesVGPRBanksTarget() const {
- assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
- }
-
- /// Always satisfied when the subtarget doesn't have a unified RF.
- inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
- }
-
- inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
- return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
- (CombineVGPRSavings && !satisifiesVGPRBanksTarget());
- }
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
+ GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
+ : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
+ RP(RP) {}
};
///////////////////////////////////////////////////////////////////////////////
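A hypothetical caller of the refactored interface (fragment; assumes the limits, Reg, and Mask are in scope). Capturing the MachineFunction at construction is what lets isSaveBeneficial drop its MRI parameter:

GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); // explicit per-region limits
if (!Target.satisfied() && Target.isSaveBeneficial(Reg))
  Target.saveReg(Reg, Mask, MF.getRegInfo());
Target.setTarget(NewMaxSGPRs, NewMaxVGPRs); // retarget without rebuilding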
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 96d5668..254b75b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}
/// Allows easy filtering of this stage's debug output.
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+#define REMAT_PREFIX "[PreRARemat] "
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
@@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing (original was "
+ << DAG.MinOccupancy;
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << ")\n";
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
- const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
- const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
- GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Stop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
+ auto ResetTargetRegions = [&]() {
+ OptRegions.clear();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
}
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- if (OptRegions.empty())
- return false;
+ };
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ ResetTargetRegions();
+ if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ // In addition to register usage being above addressable limits, occupancy
+ // below the minimum is treated as spilling as well.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy targets" for all regions.
+ TargetOcc = DAG.MinOccupancy + 1;
+ unsigned VGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
+ MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
+ ResetTargetRegions();
}
-#endif
-
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU();
+ } else if (!TargetOcc) {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ')';
+ } else {
+ dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
+ << TargetOcc;
+ }
+ dbgs() << '\n';
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
+ << '\n';
+ }
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
@@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
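Condensed, the stage's two objectives now live in the optional itself; a sketch of the control flow above with illustrative names:

#include <optional>

std::optional<unsigned> TargetOcc; // empty means "reduce spilling"

void pickObjective(bool Spilling, unsigned MinOcc, unsigned MaxWavesPerEU) {
  if (Spilling || MinOcc >= MaxWavesPerEU)
    TargetOcc = std::nullopt; // reduce spilling (or nothing left to gain)
  else
    TargetOcc = MinOcc + 1;   // try for one more wave per EU
}

bool shouldRevert(bool MaySpill, unsigned WavesAfter) {
  return MaySpill || (TargetOcc && WavesAfter < *TargetOcc);
}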
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 32139a9..790370f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -470,15 +470,12 @@ private:
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
- /// Whether the stage is attempting to increase occupancy in the absence of
- /// spilling.
- bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b327fb..1b7d65a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ } else if (Info->getNumKernargPreloadedSGPRs()) {
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
}
SmallVector<SDValue, 16> Chains;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f..49425d5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+ NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+ NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206..ca8f803 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
+ uint16_t NumWaveDispatchSGPRs = 0;
+ uint16_t NumWaveDispatchVGPRs = 0;
uint32_t HighBitsOf32BitAddress = 0;
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
+ unsigned NumWaveDispatchSGPRs = 0;
+ unsigned NumWaveDispatchVGPRs = 0;
+
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ public:
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
+ unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+ void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+ unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+ void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
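Assuming the YAML keys registered above, a hand-written MIR test could now seed the counts directly; a hypothetical snippet with illustrative values:

machineFunctionInfo:
  numWaveDispatchSGPRs: 6
  numWaveDispatchVGPRs: 3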
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e4aa8b8..e63b937 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
/*IsStore*/ true,
/*IsUnitStrided*/ false, /*UsePtrVal*/ true);
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ // Operands are (vec, ..., vec, ptr, stride, mask, vl)
+ return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4,
+ /*IsStore*/ true,
+ /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
case Intrinsic::riscv_vlm:
return SetRVVLoadStoreInfo(/*PtrOp*/ 0,
/*IsStore*/ false,
@@ -11084,69 +11095,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerVectorIntrinsicScalars(Op, DAG, Subtarget);
}
-SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntNo = Op.getConstantOperandVal(1);
+static SDValue
+lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op,
+ const RISCVSubtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool IsStrided;
switch (IntNo) {
- default:
- break;
case Intrinsic::riscv_seg2_store_mask:
case Intrinsic::riscv_seg3_store_mask:
case Intrinsic::riscv_seg4_store_mask:
case Intrinsic::riscv_seg5_store_mask:
case Intrinsic::riscv_seg6_store_mask:
case Intrinsic::riscv_seg7_store_mask:
- case Intrinsic::riscv_seg8_store_mask: {
- SDLoc DL(Op);
- static const Intrinsic::ID VssegInts[] = {
- Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
- Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
- Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
- Intrinsic::riscv_vsseg8_mask};
+ case Intrinsic::riscv_seg8_store_mask:
+ IsStrided = false;
+ break;
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ IsStrided = true;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
- // Operands: (chain, int_id, vec*, ptr, mask, vl)
- unsigned NF = Op->getNumOperands() - 5;
- assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
- MVT XLenVT = Subtarget.getXLenVT();
- MVT VT = Op->getOperand(2).getSimpleValueType();
- MVT ContainerVT = getContainerForFixedLengthVector(VT);
- unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
- ContainerVT.getScalarSizeInBits();
- EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
+ SDLoc DL(Op);
+ static const Intrinsic::ID VssegInts[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask};
+ static const Intrinsic::ID VsssegInts[] = {
+ Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask,
+ Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask,
+ Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask,
+ Intrinsic::riscv_vssseg8_mask};
+
+ // Operands: (chain, int_id, vec*, ptr, mask, vl) or
+ // (chain, int_id, vec*, ptr, stride, mask, vl)
+ unsigned NF = Op->getNumOperands() - (IsStrided ? 6 : 5);
+ assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op->getOperand(2).getSimpleValueType();
+ MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ unsigned Sz = NF * ContainerVT.getVectorMinNumElements() *
+ ContainerVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
- SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
- SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
- MVT MaskVT = Mask.getSimpleValueType();
- MVT MaskContainerVT =
- ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
- Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+ SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ SDValue Mask = Op.getOperand(Op.getNumOperands() - 2);
+ MVT MaskVT = Mask.getSimpleValueType();
+ MVT MaskContainerVT =
+ ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+ Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
- SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
- SDValue Ptr = Op->getOperand(NF + 2);
+ SDValue IntID = DAG.getTargetConstant(
+ IsStrided ? VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT);
+ SDValue Ptr = Op->getOperand(NF + 2);
- auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
+ auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op);
- SDValue StoredVal = DAG.getUNDEF(VecTupTy);
- for (unsigned i = 0; i < NF; i++)
- StoredVal = DAG.getNode(
- RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
- convertToScalableVector(
- ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget),
- DAG.getTargetConstant(i, DL, MVT::i32));
+ SDValue StoredVal = DAG.getUNDEF(VecTupTy);
+ for (unsigned i = 0; i < NF; i++)
+ StoredVal = DAG.getNode(
+ RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i),
+ DAG, Subtarget),
+ DAG.getTargetConstant(i, DL, MVT::i32));
+
+ SmallVector<SDValue, 10> Ops = {
+ FixedIntrinsic->getChain(),
+ IntID,
+ StoredVal,
+ Ptr,
+ Mask,
+ VL,
+ DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
+ // Insert the stride operand.
+ if (IsStrided)
+ Ops.insert(std::next(Ops.begin(), 4),
+ Op.getOperand(Op.getNumOperands() - 3));
+
+ return DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+ FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
+}
+
+SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::riscv_seg2_store_mask:
+ case Intrinsic::riscv_seg3_store_mask:
+ case Intrinsic::riscv_seg4_store_mask:
+ case Intrinsic::riscv_seg5_store_mask:
+ case Intrinsic::riscv_seg6_store_mask:
+ case Intrinsic::riscv_seg7_store_mask:
+ case Intrinsic::riscv_seg8_store_mask:
+ case Intrinsic::riscv_sseg2_store_mask:
+ case Intrinsic::riscv_sseg3_store_mask:
+ case Intrinsic::riscv_sseg4_store_mask:
+ case Intrinsic::riscv_sseg5_store_mask:
+ case Intrinsic::riscv_sseg6_store_mask:
+ case Intrinsic::riscv_sseg7_store_mask:
+ case Intrinsic::riscv_sseg8_store_mask:
+ return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG);
- SDValue Ops[] = {
- FixedIntrinsic->getChain(),
- IntID,
- StoredVal,
- Ptr,
- Mask,
- VL,
- DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
-
- return DAG.getMemIntrinsicNode(
- ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
- FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand());
- }
case Intrinsic::riscv_sf_vc_xv_se:
return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE);
case Intrinsic::riscv_sf_vc_iv_se:
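The NF computation above simply strips the fixed fields from the operand list; a sketch (illustrative helper, not part of the lowering):

// (chain, int_id, vec*, ptr, mask, vl) for unit-stride segment stores; the
// strided forms insert a stride before the mask, hence 6 fixed fields not 5.
unsigned segStoreNF(unsigned NumOperands, bool IsStrided) {
  return NumOperands - (IsStrided ? 6 : 5);
}
// e.g. a masked sseg3 store has 3 vecs + 6 fixed fields: segStoreNF(9, true) == 3.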
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 5541506..24ebbc3 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -524,16 +524,33 @@ foreach mx = SchedMxListW in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
+ defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c;
+ // Occupancy of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32
+ // We use the worst-case until we can split the SEW.
+ defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c;
+ // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites
+ let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+ defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+ let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in {
+ defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 13. Vector Floating-Point Instructions
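The ConstValueUntilLMULThenDouble* helpers are used but not defined in this hunk; judging from their names and the per-LMUL values quoted above, they plausibly hold a constant below a threshold LMUL and double a base from there. A guess at that arithmetic, with LMULs indexed MF8=0 through M8=6 (an assumption, not the TableGen definition):

// Assumed semantics, reconstructed from the comments above.
unsigned constUntilLMULThenDoubleBase(unsigned ThresholdIdx, unsigned Const,
                                      unsigned Base, unsigned LMULIdx) {
  return LMULIdx < ThresholdIdx ? Const : Base << (LMULIdx - ThresholdIdx);
}
// Reproduces the vsmul worst cases quoted above:
//   <"M2", 7, 8>: M1 -> 7, M2 -> 8, M4 -> 16, M8 -> 32   (latency)
//   <"M1", 1, 4>: MF2 -> 1, M1 -> 4, M2 -> 8, M8 -> 32   (occupancy)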
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index fe0f308..b17cf17 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
auto *SrcTy = cast<FixedVectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue();
+ unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy);
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
assert(SrcElemBitWidth && "vector elements must have a bitwidth");
unsigned SrcNumElems = SrcTy->getNumElements();
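The switch to the DataLayout query presumably targets pointer elements, whose size is not a primitive property of the type; a commented sketch of the difference (assumed motivation, not stated in the patch):

// auto *VecTy = FixedVectorType::get(PointerType::get(Ctx, 0), 4);
// VecTy->getPrimitiveSizeInBits(); // 0: pointers have no primitive size
// DL.getTypeSizeInBits(VecTy);     // 4 * pointer width, from the DataLayout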
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 0ddc231..e9bf59c 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
}
// Compute alignment from known bits.
+ auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) {
+ KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
+ unsigned TrailZ =
+ std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent);
+ return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ };
+
+ // Propagate alignment between loads and stores that originate from the
+ // same base pointer.
+ DenseMap<Value *, Align> BestBasePointerAligns;
+ auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) {
+ APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true);
+ // Derive the base pointer alignment from the load/store alignment
+ // and the offset from the base pointer.
+ Align BasePointerAlign =
+ commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue());
+
+ auto [It, Inserted] =
+ BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign);
+ if (!Inserted) {
+ // If the stored base pointer alignment is better than the
+ // base pointer alignment we derived, we may be able to use it
+ // to improve the load/store alignment. If not, store the
+ // improved base pointer alignment for future iterations.
+ if (It->second > BasePointerAlign) {
+ Align BetterLoadStoreAlign =
+ commonAlignment(It->second, OffsetFromBase.getLimitedValue());
+ return BetterLoadStoreAlign;
+ }
+ It->second = BasePointerAlign;
+ }
+ return LoadStoreAlign;
+ };
+
for (BasicBlock &BB : F) {
+ // We need to reset the map for each block because alignment information
+ // can only be propagated from instruction A to B if A dominates B.
+ // This is because control flow (and exception throwing) could be dependent
+ // on the address (and its alignment) at runtime. Some sort of dominator
+ // tree approach could be better, but doing a simple forward pass through a
+ // single basic block is correct too.
+ BestBasePointerAligns.clear();
+
for (Instruction &I : BB) {
Changed |= tryToImproveAlign(
DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
- KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT);
- unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
- +Value::MaxAlignmentExponent);
- return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+ return std::max(InferFromKnownBits(I, PtrOp),
+ InferFromBasePointer(PtrOp, OldAlign));
});
}
}
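A worked example of the propagation above, using commonAlignment (the greatest power of two dividing both operands); a fragment assuming a store and a later load off the same base within one block:

#include "llvm/Support/Alignment.h"
using llvm::Align;
using llvm::commonAlignment;

// A store to (base + 4) with align 16 pins the base to align 4 ...
Align BaseAlign = commonAlignment(Align(16), 4); // Align(4)
// ... so a later load from (base + 8) with align 1 can be raised to align 4.
Align LoadAlign = commonAlignment(BaseAlign, 8); // Align(4)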
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index fcdb8a9..c68149b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) {
case llvm::Instruction::FPExt:
return true;
case llvm::Instruction::AddrSpaceCast:
+ case CastInst::PtrToAddr:
case CastInst::PtrToInt:
case CastInst::IntToPtr:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index be00fd6..1ac84ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -548,9 +548,6 @@ public:
protected:
friend class LoopVectorizationPlanner;
- /// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
-
// Create a check to see if the vector loop should be executed
Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
@@ -2272,56 +2269,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getTripCount();
- IRBuilder<> Builder(InsertBlock->getTerminator());
-
- Type *Ty = TC->getType();
- // This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- // For scalable vectors the VF is not guaranteed to be a power of 2, but this
- // is accounted for in emitIterationCountCheck that adds an overflow check.
- if (Cost->foldTailByMasking()) {
- assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
- TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
- "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
- // There are cases where we *must* run at least one iteration in the remainder
- // loop. See the cost model for when this can happen. If the step evenly
- // divides the trip count, we set the remainder to be equal to the step. If
- // the step does not evenly divide the trip count, no adjustment is necessary
- // since there will already be scalar iterations. Note that the minimum
- // iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
@@ -7354,6 +7301,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+ VPlanTransforms::materializeVectorTripCount(
+ BestVPlan, VectorPH, CM.foldTailByMasking(),
+ CM.requiresScalarEpilogue(BestVF.isVector()));
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7410,8 +7360,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(
- ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+ BestVPlan.prepareToExecute(State);
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
// Move check blocks to their final position.
@@ -9407,13 +9356,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- // If index is the vector trip count, the concrete value will only be set in
- // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
- // TODO: Remove the special case for the vector trip count once it is computed
- // in VPlan and can be used during VPlan simplification.
- assert((DerivedIV != Index ||
- getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
- "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 39011e7..ec06a21 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12050,7 +12050,8 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
for (auto [V, Op] : zip(VL, Operands.front())) {
auto *I = dyn_cast<Instruction>(Op);
if (!I || !I->hasOneUse()) {
- FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (auto *OpI = dyn_cast<Instruction>(V))
+ FMACost += TTI.getInstructionCost(OpI, CostKind);
if (I)
FMACost += TTI.getInstructionCost(I, CostKind);
continue;
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index f32d57f..e414c12 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
case Instruction::Opcode::FPToUI:
case Instruction::Opcode::FPToSI:
case Instruction::Opcode::FPExt:
+ case Instruction::Opcode::PtrToAddr:
case Instruction::Opcode::PtrToInt:
case Instruction::Opcode::IntToPtr:
case Instruction::Opcode::SIToFP:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 73babcc..a820b524 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -951,15 +951,9 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
- if (!VectorTripCount.getUnderlyingValue())
- VectorTripCount.setUnderlyingValue(VectorTripCountV);
- else
- assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
- "VectorTripCount set earlier must much VectorTripCountV");
-
+void VPlan::prepareToExecute(VPTransformState &State) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
- Type *TCTy = VectorTripCountV->getType();
+ Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount());
// FIXME: Model VF * UF computation completely in VPlan.
unsigned UF = getUF();
if (VF.getNumUsers()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42cdd5..6f098351 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2408,11 +2408,11 @@ public:
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i)) {
- if (I->getType()->isVoidTy())
+ for (unsigned I = 0; I < IG->getFactor(); ++I)
+ if (Instruction *Inst = IG->getMember(I)) {
+ if (Inst->getType()->isVoidTy())
continue;
- new VPValue(I, this);
+ new VPValue(Inst, this);
}
for (auto *SV : StoredValues)
@@ -3969,7 +3969,7 @@ public:
}
/// Prepare the plan for execution, setting up the required live-in values.
- void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
+ void prepareToExecute(VPTransformState &State);
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1c8bd6c..34b2abf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3278,6 +3278,67 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
+void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue) {
+ VPValue &VectorTC = Plan.getVectorTripCount();
+ assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
+ // There's nothing to do if there are no users of the vector trip count or its
+ // IR value has already been set.
+ if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
+ return;
+
+ VPValue *TC = Plan.getTripCount();
+ Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
+ VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
+ VPValue *Step = &Plan.getVFxUF();
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
+ if (TailByMasking) {
+ TC = Builder.createNaryOp(
+ Instruction::Add,
+ {TC, Builder.createNaryOp(
+ Instruction::Sub,
+ {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ DebugLoc::getCompilerGenerated(), "n.rnd.up");
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ VPValue *R =
+ Builder.createNaryOp(Instruction::URem, {TC, Step},
+ DebugLoc::getCompilerGenerated(), "n.mod.vf");
+
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See the cost model for when this can happen. If the step evenly
+ // divides the trip count, we set the remainder to be equal to the step. If
+ // the step does not evenly divide the trip count, no adjustment is necessary
+ // since there will already be scalar iterations. Note that the minimum
+ // iterations check ensures that N >= Step.
+ if (RequiresScalarEpilogue) {
+ assert(!TailByMasking &&
+ "requiring scalar epilogue is not supported with fail folding");
+ VPValue *IsZero = Builder.createICmp(
+ CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ R = Builder.createSelect(IsZero, Step, R);
+ }
+
+ VPValue *Res = Builder.createNaryOp(
+ Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
+ VectorTC.replaceAllUsesWith(Res);
+}
+
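As a scalar model of the n.vec computation materialized above (a sketch, not the VPlan recipes; Step stands for VF * UF):

#include <cstdint>

uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool TailByMasking,
                         bool RequiresScalarEpilogue) {
  if (TailByMasking)
    N += Step - 1;               // n.rnd.up: round up instead of down
  uint64_t R = N % Step;         // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                    // keep at least one remainder iteration
  return N - R;                  // n.vec
}
// e.g. N=17, Step=8: plain -> 16; tail-folded -> 24; N=16 with a required
// scalar epilogue -> 8.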
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cc50c75..2afe956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,12 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Materialize vector trip count computations to a set of VPInstructions.
+ static void materializeVectorTripCount(VPlan &Plan,
+ VPBasicBlock *VectorPHVPBB,
+ bool TailByMasking,
+ bool RequiresScalarEpilogue);
+
/// Materialize the backedge-taken count to be computed explicitly using
/// VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan,