Diffstat (limited to 'llvm/lib')
51 files changed, 898 insertions, 517 deletions
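Note on the Delinearization.cpp change below: it adds a hidden flag that makes the analysis printer fall back to a fixed-size-array heuristic when regular delinearization fails. A hedged usage sketch follows (the flag name comes from this diff; the print<delinearization> printer pass is assumed to be the existing upstream one, and input.ll is a placeholder):
  opt -passes='print<delinearization>' -delinearize-use-fixed-size-array-heuristic -disable-output input.ll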
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index dd98b62..c14cb9e 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, switch (Opcode) { default: llvm_unreachable("Missing case"); + case Instruction::PtrToAddr: + // TODO: Add some of the ptrtoint folds here as well. + break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 329bd35..761c566 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -32,6 +33,11 @@ using namespace llvm; #define DL_NAME "delinearize" #define DEBUG_TYPE DL_NAME +static cl::opt<bool> UseFixedSizeArrayHeuristic( + "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden, + cl::desc("When printing analysis, use the heuristic for fixed-size arrays " + "if the default delinearization fails.")); + // Return true when S contains at least an undef value. static inline bool containsUndefs(const SCEV *S) { return SCEVExprContains(S, [](const SCEV *S) { @@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr, }); } +static std::optional<APInt> tryIntoAPInt(const SCEV *S) { + if (const auto *Const = dyn_cast<SCEVConstant>(S)) + return Const->getAPInt(); + return std::nullopt; +} + +/// Collects the absolute values of constant steps for all induction variables. +/// Returns true if we can prove that all step recurrences are constants and \p +/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p +/// Steps after being divided by \p ElementSize. +static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Steps, + uint64_t ElementSize) { + // End of recursion. The constant value also must be a multiple of + // ElementSize. + if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) { + const uint64_t Mod = Const->getAPInt().urem(ElementSize); + return Mod == 0; + } + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AR || !AR->isAffine()) + return false; + + const SCEV *Step = AR->getStepRecurrence(SE); + std::optional<APInt> StepAPInt = tryIntoAPInt(Step); + if (!StepAPInt) + return false; + + APInt Q; + uint64_t R; + APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R); + if (R != 0) + return false; + + // Bail out when the step is too large. + std::optional<uint64_t> StepVal = Q.tryZExtValue(); + if (!StepVal) + return false; + + Steps.push_back(*StepVal); + return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize); +} + +bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Sizes, + const SCEV *ElementSize) { + if (!ElementSize) + return false; + + std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize); + if (!ElementSizeAPInt || *ElementSizeAPInt == 0) + return false; + + std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue(); + + // Early exit when ElementSize is not a positive constant.
+ if (!ElementSizeConst) + return false; + + if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) || + Sizes.empty()) { + Sizes.clear(); + return false; + } + + // At this point, Sizes contains the absolute step recurrences for all + // induction variables. Each step recurrence must be a multiple of the size of + // the array element. Assuming that each value represents the size of an + // array for each dimension, we attempt to restore the length of each dimension + // by dividing the step recurrence by the next smaller value. For example, if + // we have the following AddRec SCEV: + // + // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) + // + // Then Sizes will become [256, 32, 1] after sorting. We don't know the size of + // the outermost dimension; the next dimension will be computed as 256 / 32 = + // 8, and the last dimension will be computed as 32 / 1 = 32. Thus it results + // in something like Arr[UnknownSize][8][32] with elements of size 8 bytes, where Arr is + // a base pointer. + // + // TODO: Catch more cases, e.g., when a step recurrence is not divisible by + // the next smaller one, like A[i][3*j]. + llvm::sort(Sizes.rbegin(), Sizes.rend()); + Sizes.erase(llvm::unique(Sizes), Sizes.end()); + + // The last element in Sizes should be ElementSize. At this point, all values + // in Sizes are assumed to be divided by ElementSize, so replace it with 1. + assert(Sizes.back() != 0 && "Unexpected zero size in Sizes."); + Sizes.back() = 1; + + for (unsigned I = 0; I + 1 < Sizes.size(); I++) { + uint64_t PrevSize = Sizes[I + 1]; + if (Sizes[I] % PrevSize) { + Sizes.clear(); + return false; + } + Sizes[I] /= PrevSize; + } + + // Finally, the last element in Sizes should be ElementSize. + Sizes.back() = *ElementSizeConst; + return true; +} + +/// Splits the SCEV into two vectors of SCEVs representing the subscripts and +/// sizes of an array access, assuming that the array is a fixed size array. +/// +/// E.g., if we have code like the following: +/// +/// double A[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// +/// The access function will be represented as an AddRec SCEV like: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// Then findFixedSizeArrayDimensions infers the size of each dimension of the +/// array based on the fact that the value of the step recurrence is a multiple +/// of the size of the corresponding array element. In the above example, it +/// results in the following: +/// +/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes. +/// +/// Finally each subscript will be computed as follows: +/// +/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] +/// +/// Note that this function doesn't check the range of possible values for each +/// subscript, so the caller should perform additional boundary checks if +/// necessary. +/// +/// Also note that this function doesn't guarantee that the original array size +/// is restored "correctly". For example, in the following case: +/// +/// double A[42][4][64]; +/// double B[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// use B[i][2*j][k] +/// +/// The access function for both accesses will be the same: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// The array sizes for both A and B will be computed as +/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B.
+/// +/// TODO: At the moment, this function can handle only simple cases. For +/// example, we cannot handle a case where a step recurrence is not divisible +/// by the next smaller step recurrence, e.g., A[i][3*j]. +bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<const SCEV *> &Subscripts, + SmallVectorImpl<const SCEV *> &Sizes, + const SCEV *ElementSize) { + + // First step: find the fixed array size. + SmallVector<uint64_t, 4> ConstSizes; + if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) { + Sizes.clear(); + return false; + } + + // Convert the constant size to SCEV. + for (uint64_t Size : ConstSizes) + Sizes.push_back(SE.getConstant(Expr->getType(), Size)); + + // Second step: compute the access functions for each subscript. + computeAccessFunctions(SE, Expr, Subscripts, Sizes); + + return !Subscripts.empty(); +} + bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts, @@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI, O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; + + auto IsDelinearizationFailed = [&]() { + return Subscripts.size() == 0 || Sizes.size() == 0 || + Subscripts.size() != Sizes.size(); + }; + delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst)); - if (Subscripts.size() == 0 || Sizes.size() == 0 || - Subscripts.size() != Sizes.size()) { + if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) { + Subscripts.clear(); + Sizes.clear(); + delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes, + SE->getElementSize(&Inst)); + } + + if (IsDelinearizationFailed()) { O << "failed to delinearize\n"; continue; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1e70228..b0e4b00 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9147,7 +9147,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst, return false; for (unsigned I = 0; I != 2; ++I) { - if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) { + if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I)); + Operation && Operation->getNumOperands() >= 2) { Value *LHS = Operation->getOperand(0); Value *RHS = Operation->getOperand(1); if (LHS != PN && RHS != PN) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 520c6a0..3d5bd61 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(fptoui, FPToUI); INSTKEYWORD(fptosi, FPToSI); INSTKEYWORD(inttoptr, IntToPtr); + INSTKEYWORD(ptrtoaddr, PtrToAddr); INSTKEYWORD(ptrtoint, PtrToInt); INSTKEYWORD(bitcast, BitCast); INSTKEYWORD(addrspacecast, AddrSpaceCast); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 13bef1f..1bc2906 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { case lltok::kw_bitcast: case lltok::kw_addrspacecast: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: { unsigned Opc = Lex.getUIntVal(); Type *DestTy = nullptr; @@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fptoui: case lltok::kw_fptosi: case lltok::kw_inttoptr: 
+ case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: return parseCast(Inst, PFS, KeywordVal); case lltok::kw_fptrunc: diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 290d873..22a0d0f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) { case bitc::CAST_SITOFP : return Instruction::SIToFP; case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; case bitc::CAST_FPEXT : return Instruction::FPExt; + case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr; case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; case bitc::CAST_BITCAST : return Instruction::BitCast; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05680fa..a3f8254 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { case Instruction::SIToFP : return bitc::CAST_SITOFP; case Instruction::FPTrunc : return bitc::CAST_FPTRUNC; case Instruction::FPExt : return bitc::CAST_FPEXT; + case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR; case Instruction::PtrToInt: return bitc::CAST_PTRTOINT; case Instruction::IntToPtr: return bitc::CAST_INTTOPTR; case Instruction::BitCast : return bitc::CAST_BITCAST; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c72b6e8..23a3543 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV, break; // Error } + case Instruction::PtrToAddr: case Instruction::PtrToInt: { const DataLayout &DL = getDataLayout(); diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index ca51b67..5f37890 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) { - int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc(); if (PDiff > 0) { Delta.CriticalMax = PressureChange(i); Delta.CriticalMax.setUnitInc(PDiff); @@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { - int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc(); if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5f1e38a..17703f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16342,6 +16342,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { DAG, DL); } break; + case ISD::ABDU: + case ISD::ABDS: + // (trunc (abdu/abds a, b)) → (abdu/abds (trunc a), (trunc b)) + if (!LegalOperations || N0.hasOneUse()) { + EVT SrcVT = N0.getValueType(); + EVT TruncVT = VT; + unsigned SrcBits = 
SrcVT.getScalarSizeInBits(); + unsigned TruncBits = TruncVT.getScalarSizeInBits(); + unsigned NeededBits = SrcBits - TruncBits; + + SDValue A = N0.getOperand(0); + SDValue B = N0.getOperand(1); + bool CanFold = false; + + if (N0.getOpcode() == ISD::ABDU) { + KnownBits KnownA = DAG.computeKnownBits(A); + KnownBits KnownB = DAG.computeKnownBits(B); + CanFold = KnownA.countMinLeadingZeros() >= NeededBits && + KnownB.countMinLeadingZeros() >= NeededBits; + } else { + unsigned SignBitsA = DAG.ComputeNumSignBits(A); + unsigned SignBitsB = DAG.ComputeNumSignBits(B); + CanFold = SignBitsA > NeededBits && SignBitsB > NeededBits; + } + + if (CanFold && TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A); + SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B); + return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB); + } + } + break; } return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0d1e954..48ab797 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3977,6 +3977,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } +void SelectionDAGBuilder::visitPtrToAddr(const User &I) { + // FIXME: this is not correct for pointers with addr width != pointer width + visitPtrToInt(I); +} + void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c251755..e0835e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -574,6 +574,7 @@ private: void visitFPToSI(const User &I); void visitUIToFP(const User &I); void visitSIToFP(const User &I); + void visitPtrToAddr(const User &I); void visitPtrToInt(const User &I); void visitIntToPtr(const User &I); void visitBitCast(const User &I); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index bf4c9f9..d80a229 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case SIToFP: return ISD::SINT_TO_FP; case FPTrunc: return ISD::FP_ROUND; case FPExt: return ISD::FP_EXTEND; + case PtrToAddr: return ISD::BITCAST; case PtrToInt: return ISD::BITCAST; case IntToPtr: return ISD::BITCAST; case BitCast: return ISD::BITCAST; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 9d84aa8..72308a3d 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) { // This Range is reserverved, therefore invalid, according to the spec // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal bool verifyRegisterSpace(uint32_t RegisterSpace) { - return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF); + return !(RegisterSpace >= 0xFFFFFFF0); } bool verifyRootDescriptorFlag(uint32_t Version, uint32_t 
FlagsVal) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 35f00ae..b91fd70 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1314,12 +1314,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if ((Name.starts_with("lifetime.start") || Name.starts_with("lifetime.end")) && F->arg_size() == 2) { + Intrinsic::ID IID = Name.starts_with("lifetime.start") + ? Intrinsic::lifetime_start + : Intrinsic::lifetime_end; rename(F); - NewFn = Intrinsic::getOrInsertDeclaration( - F->getParent(), - Name.starts_with("lifetime.start") ? Intrinsic::lifetime_start - : Intrinsic::lifetime_end, - F->getArg(0)->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + F->getArg(0)->getType()); return true; } break; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d4ad21e..6b202ba 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return FoldBitCast(V, DestTy); case Instruction::AddrSpaceCast: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: return nullptr; } diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index e09c139..2fcdbcc6 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::AddrSpaceCast: // Conservatively return getFull set. diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a3c725b..c7e3113a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty, llvm_unreachable("Invalid cast opcode"); case Instruction::Trunc: return getTrunc(C, Ty, OnlyIfReduced); + case Instruction::PtrToAddr: + return getPtrToAddr(C, Ty, OnlyIfReduced); case Instruction::PtrToInt: return getPtrToInt(C, Ty, OnlyIfReduced); case Instruction::IntToPtr: @@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced); } +Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy, + bool OnlyIfReduced) { + assert(C->getType()->isPtrOrPtrVectorTy() && + "PtrToAddr source must be pointer or pointer vector"); + assert(DstTy->isIntOrIntVectorTy() && + "PtrToAddr destination must be integer or integer vector"); + assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy)); + if (isa<VectorType>(C->getType())) + assert(cast<VectorType>(C->getType())->getElementCount() == + cast<VectorType>(DstTy)->getElementCount() && + "Invalid cast between a different number of vector elements"); + return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced); +} + Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->isPtrOrPtrVectorTy() && @@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) { 
case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) { case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const { switch (getOpcode()) { case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index f1d4549..96065ed 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR) DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef<Metadata *> MDs, bool ImplicitCode) - : MDNode(C, DILocationKind, Storage, MDs) -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - , - AtomGroup(AtomGroup), AtomRank(AtomRank) -#endif -{ -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS + : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup), + AtomRank(AtomRank) { assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits"); -#endif if (AtomGroup) C.updateDILocationAtomGroupWaterline(AtomGroup + 1); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 7b799c7..11d33e2 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases, return findBaseObject(CE->getOperand(0), Aliases, Op); } case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::BitCast: case Instruction::GetElementPtr: diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index b7cd12a..4540268 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -817,6 +817,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case UIToFP: return "uitofp"; case SIToFP: return "sitofp"; case IntToPtr: return "inttoptr"; + case PtrToAddr: return "ptrtoaddr"; case PtrToInt: return "ptrtoint"; case BitCast: return "bitcast"; case AddrSpaceCast: return "addrspacecast"; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b896382..a1751c0 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode, return false; case Instruction::BitCast: return true; // BitCast never modifies bits. + case Instruction::PtrToAddr: case Instruction::PtrToInt: return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() == DestTy->getScalarSizeInBits(); @@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair( // same reason. 
const unsigned numCastOps = Instruction::CastOpsEnd - Instruction::CastOpsBegin; + // clang-format off static const uint8_t CastResults[numCastOps][numCastOps] = { - // T F F U S F F P I B A -+ - // R Z S P P I I T P 2 N T S | - // U E E 2 2 2 2 R E I T C C +- secondOp - // N X X U S F F N X N 2 V V | - // C T T I I P P C T T P T T -+ - { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+ - { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt | - { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc | - { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt | - { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt | - { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr | - { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast | - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ + // T F F U S F F P P I B A -+ + // R Z S P P I I T P 2 2 N T S | + // U E E 2 2 2 2 R E I A T C C +- secondOp + // N X X U S F F N X N D 2 V V | + // C T T I I P P C T T R P T T -+ + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+ + { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt | + { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | + { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // clang-format on // TODO: This logic could be encoded into the table above and handled in the // switch below. 
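As a minimal sketch (not part of this patch) of how the new cast is used: the textual form added by the lexer/parser hunks above is %a = ptrtoaddr ptr %p to i64, and the C++ entry points introduced in this diff are ConstantExpr::getPtrToAddr and PtrToAddrInst. The helper below is hypothetical and only illustrates those two entry points; the destination type must be the DataLayout address type for the pointer's address space (see the Verifier changes later in this diff).
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  // Illustrative helper (not in the patch): build a ptrtoaddr cast of Ptr to AddrTy.
  static Value *emitPtrToAddr(Value *Ptr, Type *AddrTy, InsertPosition InsertPt) {
    if (auto *C = dyn_cast<Constant>(Ptr))
      return ConstantExpr::getPtrToAddr(C, AddrTy, /*OnlyIfReduced=*/false);
    return new PtrToAddrInst(Ptr, AddrTy, "addr", InsertPt);
  }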
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore); case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore); case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore); + case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore); case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore); case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore); case BitCast: @@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) { case Instruction::FPToSI: return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() && SrcEC == DstEC; + case Instruction::PtrToAddr: case Instruction::PtrToInt: if (SrcEC != DstEC) return false; @@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name, assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); } +PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name, + InsertPosition InsertBefore) + : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr"); +} + IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name, InsertPosition InsertBefore) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { @@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const { return new PtrToIntInst(getOperand(0), getType()); } +PtrToAddrInst *PtrToAddrInst::cloneImpl() const { + return new PtrToAddrInst(getOperand(0), getType()); +} + IntToPtrInst *IntToPtrInst::cloneImpl() const { return new IntToPtrInst(getOperand(0), getType()); } diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index aa2a60e..e03f993 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey { template <> struct MDNodeKeyImpl<DILocation> { Metadata *Scope; Metadata *InlinedAt; -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS uint64_t AtomGroup : 61; uint64_t AtomRank : 3; -#endif unsigned Line; uint16_t Column; bool ImplicitCode; @@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> { MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope, Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup, uint8_t AtomRank) - : Scope(Scope), InlinedAt(InlinedAt), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - AtomGroup(AtomGroup), AtomRank(AtomRank), -#endif - Line(Line), Column(Column), ImplicitCode(ImplicitCode) { - } + : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup), + AtomRank(AtomRank), Line(Line), Column(Column), + ImplicitCode(ImplicitCode) {} MDNodeKeyImpl(const DILocation *L) : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()), -#endif Line(L->getLine()), Column(L->getColumn()), - ImplicitCode(L->isImplicitCode()) { - } + ImplicitCode(L->isImplicitCode()) {} bool isKeyOf(const DILocation *RHS) const { return Line == RHS->getLine() && Column == RHS->getColumn() && Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() && - ImplicitCode == RHS->isImplicitCode() -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - && AtomGroup == RHS->getAtomGroup() && - AtomRank == RHS->getAtomRank(); -#else - ; -#endif + ImplicitCode == RHS->isImplicitCode() && + AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank(); } unsigned getHashValue() const { -#ifdef 
EXPERIMENTAL_KEY_INSTRUCTIONS // Hashing AtomGroup and AtomRank substantially impacts performance whether // Key Instructions is enabled or not. We can't detect whether it's enabled // here cheaply; avoiding hashing zero values is a good approximation. This @@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> { if (AtomGroup || AtomRank) return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode, AtomGroup, (uint8_t)AtomRank); -#endif return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode); } }; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 129ca4a..5928c89 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets( // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. - unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType()); - if (CurBitWidth == BitWidth) { - if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis)) - return V; - } else { - APInt GEPOffset(CurBitWidth, 0); - if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) - return V; + APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0); + if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) + return V; - // Stop traversal if the pointer offset wouldn't fit in the bit-width - // provided by the Offset argument. This can happen due to AddrSpaceCast - // stripping. - if (GEPOffset.getSignificantBits() > BitWidth) - return V; + // Stop traversal if the pointer offset wouldn't fit in the bit-width + // provided by the Offset argument. This can happen due to AddrSpaceCast + // stripping. + if (GEPOffset.getSignificantBits() > BitWidth) + return V; - // External Analysis can return a result higher/lower than the value - // represents. We need to detect overflow/underflow. - APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); - if (!ExternalAnalysis) { - Offset += GEPOffsetST; - } else { - bool Overflow = false; - APInt OldOffset = Offset; - Offset = Offset.sadd_ov(GEPOffsetST, Overflow); - if (Overflow) { - Offset = OldOffset; - return V; - } + // External Analysis can return a result higher/lower than the value + // represents. We need to detect overflow/underflow. + APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); + if (!ExternalAnalysis) { + Offset += GEPOffsetST; + } else { + bool Overflow = false; + APInt OldOffset = Offset; + Offset = Offset.sadd_ov(GEPOffsetST, Overflow); + if (Overflow) { + Offset = OldOffset; + return V; } } V = GEP->getPointerOperand(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index f5dcb5e..1d3c379 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -566,6 +566,8 @@ private: void visitUIToFPInst(UIToFPInst &I); void visitSIToFPInst(SIToFPInst &I); void visitIntToPtrInst(IntToPtrInst &I); + void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V); + void visitPtrToAddrInst(PtrToAddrInst &I); void visitPtrToIntInst(PtrToIntInst &I); void visitBitCastInst(BitCastInst &I); void visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { &GV); Check(GV.getInitializer()->getType()->isSized(), "Global variable initializer must be sized", &GV); + visitConstantExprsRecursively(GV.getInitializer()); // If the global has common linkage, it must have a zero initializer and // cannot be constant. 
if (GV.hasCommonLinkage()) { @@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0), CE->getType()), "Invalid bitcast", CE); + else if (CE->getOpcode() == Instruction::PtrToAddr) + checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE); } void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { @@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) { visitInstruction(I); } +void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) { + Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V); + Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch", + V); + + if (SrcTy->isVectorTy()) { + auto *VSrc = cast<VectorType>(SrcTy); + auto *VDest = cast<VectorType>(DestTy); + Check(VSrc->getElementCount() == VDest->getElementCount(), + "PtrToAddr vector length mismatch", V); + } + + Type *AddrTy = DL.getAddressType(SrcTy); + Check(AddrTy == DestTy, "PtrToAddr result must be address width", V); +} + +void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) { + checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I); + visitInstruction(I); +} + void Verifier::visitPtrToIntInst(PtrToIntInst &I) { // Get the source and destination types Type *SrcTy = I.getOperand(0)->getType(); @@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "PtrToInt Vector width mismatch", &I); + "PtrToInt Vector length mismatch", &I); } visitInstruction(I); @@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "IntToPtr Vector width mismatch", &I); + "IntToPtr Vector length mismatch", &I); } visitInstruction(I); } diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 7ca26aa..df807fc 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -331,61 +331,34 @@ void InstrProfWriter::addDataAccessProfData( DataAccessProfileData = std::move(DataAccessProfDataIn); } -void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) { - assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength); - assert(!Trace.FunctionNameRefs.empty()); - if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) { - // Simply append the trace if we have not yet hit our reservoir size limit. - TemporalProfTraces.push_back(std::move(Trace)); - } else { - // Otherwise, replace a random trace in the stream. 
- std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); - uint64_t RandomIndex = Distribution(RNG); - if (RandomIndex < TemporalProfTraces.size()) - TemporalProfTraces[RandomIndex] = std::move(Trace); - } - ++TemporalProfTraceStreamSize; -} - void InstrProfWriter::addTemporalProfileTraces( SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) { + if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize) + TemporalProfTraces.truncate(TemporalProfTraceReservoirSize); for (auto &Trace : SrcTraces) if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength) Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength); llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); }); - // Assume that the source has the same reservoir size as the destination to - // avoid needing to record it in the indexed profile format. - bool IsDestSampled = - (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize); - bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize); - if (!IsDestSampled && IsSrcSampled) { - // If one of the traces are sampled, ensure that it belongs to Dest. - std::swap(TemporalProfTraces, SrcTraces); - std::swap(TemporalProfTraceStreamSize, SrcStreamSize); - std::swap(IsDestSampled, IsSrcSampled); - } - if (!IsSrcSampled) { - // If the source stream is not sampled, we add each source trace normally. - for (auto &Trace : SrcTraces) - addTemporalProfileTrace(std::move(Trace)); + // If there are no source traces, it is probably because + // --temporal-profile-max-trace-length=0 was set to deliberately remove all + // traces. In that case, we do not want to increase the stream size + if (SrcTraces.empty()) return; - } - // Otherwise, we find the traces that would have been removed if we added - // the whole source stream. - SmallSetVector<uint64_t, 8> IndicesToReplace; - for (uint64_t I = 0; I < SrcStreamSize; I++) { - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); + // Add traces until our reservoir is full or we run out of source traces + auto SrcTraceIt = SrcTraces.begin(); + while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize && + SrcTraceIt < SrcTraces.end()) + TemporalProfTraces.push_back(*SrcTraceIt++); + // Our reservoir is full, we need to sample the source stream + llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG); + for (uint64_t I = TemporalProfTraces.size(); + I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) { + std::uniform_int_distribution<uint64_t> Distribution(0, I); uint64_t RandomIndex = Distribution(RNG); if (RandomIndex < TemporalProfTraces.size()) - IndicesToReplace.insert(RandomIndex); - ++TemporalProfTraceStreamSize; + TemporalProfTraces[RandomIndex] = *SrcTraceIt++; } - // Then we insert a random sample of the source traces. 
- llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG); - for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces)) - TemporalProfTraces[Index] = std::move(Trace); + TemporalProfTraceStreamSize += SrcStreamSize; } void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fe34037..70ac68a 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { case llvm::Instruction::FPToUI: case llvm::Instruction::FPToSI: case llvm::Instruction::FPExt: + case llvm::Instruction::PtrToAddr: case llvm::Instruction::PtrToInt: case llvm::Instruction::IntToPtr: case llvm::Instruction::SIToFP: diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 956047c..1a81d18 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) { return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI); case Instruction::Opcode::FPExt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt); + case Instruction::Opcode::PtrToAddr: + return static_cast<llvm::Instruction::CastOps>( + llvm::Instruction::PtrToAddr); case Instruction::Opcode::PtrToInt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt); case Instruction::Opcode::IntToPtr: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f05add..5c94aeb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) { VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; } -static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { +static InstructionCost getHistogramCost(const AArch64Subtarget *ST, + const IntrinsicCostAttributes &ICA) { + // We need to know at least the number of elements in the vector of buckets + // and the size of each element to update. + if (ICA.getArgTypes().size() < 2) + return InstructionCost::getInvalid(); + + // Only interested in costing for the hardware instruction from SVE2. 
+ if (!ST->hasSVE2()) + return InstructionCost::getInvalid(); + Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements unsigned TotalHistCnts = 1; @@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; TotalHistCnts = EC / NaturalVectorWidth; + + return InstructionCost(BaseHistCntCost * TotalHistCnts); } - return InstructionCost(BaseHistCntCost * TotalHistCnts); + return InstructionCost::getInvalid(); } InstructionCost @@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { - case Intrinsic::experimental_vector_histogram_add: - if (!ST->hasSVE2()) - return InstructionCost::getInvalid(); - return getHistogramCost(ICA); + case Intrinsic::experimental_vector_histogram_add: { + InstructionCost HistCost = getHistogramCost(ST, ICA); + // If the cost isn't valid, we may still be able to scalarize + if (HistCost.isValid()) + return HistCost; + break; + } case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: @@ -3975,6 +3990,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead( return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; } +std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost( + Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const { + if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy()) + return std::nullopt; + if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16()) + return std::nullopt; + + Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext())); + InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; + Cost += InstCost(PromotedTy); + if (IncludeTrunc) + Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy, + TTI::CastContextHint::None, CostKind); + return Cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, @@ -3997,6 +4033,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + // Increase the cost for half and bfloat types if not architecturally + // supported. + if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL || + ISD == ISD::FDIV || ISD == ISD::FREM) + if (auto PromotedCost = getFP16BF16PromoteCost( + Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true, + [&](Type *PromotedTy) { + return getArithmeticInstrCost(Opcode, PromotedTy, CostKind, + Op1Info, Op2Info); + })) + return *PromotedCost; + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4265,11 +4313,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( [[fallthrough]]; case ISD::FADD: case ISD::FSUB: - // Increase the cost for half and bfloat types if not architecturally - // supported. 
- if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || - (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) - return 2 * LT.first; if (!Ty->getScalarType()->isFP128Ty()) return LT.first; [[fallthrough]]; @@ -4371,25 +4414,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } if (Opcode == Instruction::FCmp) { - // Without dedicated instructions we promote f16 + bf16 compares to f32. - if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || - ValTy->getScalarType()->isBFloatTy()) { - Type *PromotedTy = - ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); - InstructionCost Cost = - getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - if (!Op1Info.isConstant() && !Op2Info.isConstant()) - Cost *= 2; - Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, - Op1Info, Op2Info); - if (ValTy->isVectorTy()) - Cost += getCastInstrCost( - Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), - VectorType::getInteger(cast<VectorType>(PromotedTy)), - TTI::CastContextHint::None, CostKind); - return Cost; - } + if (auto PromotedCost = getFP16BF16PromoteCost( + ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false, + [&](Type *PromotedTy) { + InstructionCost Cost = + getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, + CostKind, Op1Info, Op2Info); + if (isa<VectorType>(PromotedTy)) + Cost += getCastInstrCost( + Instruction::Trunc, + VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); + return Cost; + })) + return *PromotedCost; auto LT = getTypeLegalizationCost(ValTy); // Model unknown fp compares as a libcall. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7f45177..fa9b25a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -435,6 +435,14 @@ public: bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); } + /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the + /// architecture features are not present. + std::optional<InstructionCost> + getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const; + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..626734a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. 
- // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. - const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). - if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. 
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..64a9bde 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 
- unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. 
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. 
occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. 
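The replacement code added later in this hunk chooses the stage's objective up front: if any region already misses the no-spill register limits (or occupancy is already maximal), the stage only tries to reduce spilling and leaves the occupancy target unset; otherwise it aims for one extra wave and rebuilds the per-region targets against the tighter limits. A rough standalone sketch of that decision, with plain booleans standing in for GCNRPTarget::satisfied():

```cpp
#include <optional>
#include <vector>

// Illustrative model of the stage's objective selection after this change.
std::optional<unsigned>
pickTargetOccupancy(const std::vector<bool> &RegionSatisfiedAtNoSpillLimits,
                    unsigned MinOccupancy, unsigned MaxWavesPerEU) {
  bool AnySpilling = false;
  for (bool Satisfied : RegionSatisfiedAtNoSpillLimits)
    AnySpilling |= !Satisfied;

  // Spilling anywhere, or already at maximum occupancy: no occupancy target,
  // the stage only tries to reduce spilling.
  if (AnySpilling || MinOccupancy >= MaxWavesPerEU)
    return std::nullopt;

  // Otherwise aim for one extra wave per EU; the per-region targets are then
  // rebuilt against the register limits for that occupancy.
  return MinOccupancy + 1;
}

int main() {
  // No spilling anywhere and occupancy 4 < 8: aim for 5 waves per EU.
  auto Target = pickTargetOccupancy({true, true, true}, 4, 8);
  return (Target && *Target == 5) ? 0 : 1;
}
```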
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. 
Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. 
If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b327fb..1b7d65a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..49425d5 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. 
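Both the GlobalISel and SelectionDAG paths record the wave-dispatch register counts as the index of the first unallocated SGPR/VGPR, which is only correct because CCInfo assigns those registers in ascending order with no gaps, as the in-tree comments note. A small standalone illustration of that counting assumption; firstUnallocated below is a stand-in, not the CCState API:

```cpp
#include <vector>

// Given the allocation status of a register class assigned in ascending
// order with no gaps, the index of the first free register equals the number
// of registers handed out so far.
static unsigned firstUnallocated(const std::vector<bool> &Allocated) {
  unsigned I = 0;
  while (I < Allocated.size() && Allocated[I])
    ++I;
  return I;
}

int main() {
  // SGPR0..SGPR3 consumed for wave dispatch, the rest untouched.
  std::vector<bool> SGPRs = {true, true, true, true, false, false};
  unsigned NumWaveDispatchSGPRs = firstUnallocated(SGPRs); // == 4
  return NumWaveDispatchSGPRs == 4 ? 0 : 1;
}
```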
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e4aa8b8..e63b937 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + // Operands are (vec, ..., vec, ptr, offset, mask, vl) + return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vlm: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, @@ -11084,69 +11095,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } -SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntNo = Op.getConstantOperandVal(1); +static SDValue +lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + bool IsStrided; switch (IntNo) { - default: - break; case Intrinsic::riscv_seg2_store_mask: case Intrinsic::riscv_seg3_store_mask: case Intrinsic::riscv_seg4_store_mask: case Intrinsic::riscv_seg5_store_mask: case Intrinsic::riscv_seg6_store_mask: case Intrinsic::riscv_seg7_store_mask: - case Intrinsic::riscv_seg8_store_mask: { - SDLoc DL(Op); - static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; + case Intrinsic::riscv_seg8_store_mask: + IsStrided = false; + break; + case Intrinsic::riscv_sseg2_store_mask: + case 
Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + IsStrided = true; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } - // Operands: (chain, int_id, vec*, ptr, mask, vl) - unsigned NF = Op->getNumOperands() - 5; - assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); - MVT XLenVT = Subtarget.getXLenVT(); - MVT VT = Op->getOperand(2).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * - ContainerVT.getScalarSizeInBits(); - EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); + SDLoc DL(Op); + static const Intrinsic::ID VssegInts[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + static const Intrinsic::ID VsssegInts[] = { + Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask, + Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask, + Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask, + Intrinsic::riscv_vssseg8_mask}; + + // Operands: (chain, int_id, vec*, ptr, mask, vl) or + // (chain, int_id, vec*, ptr, stride, mask, vl) + unsigned NF = Op->getNumOperands() - (IsStrided ? 6 : 5); + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getOperand(2).getSimpleValueType(); + MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget); + unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * + ContainerVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); - SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); - MVT MaskVT = Mask.getSimpleValueType(); - MVT MaskContainerVT = - ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); - Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); + MVT MaskVT = Mask.getSimpleValueType(); + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); - SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); - SDValue Ptr = Op->getOperand(NF + 2); + SDValue IntID = DAG.getTargetConstant( + IsStrided ? 
VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT); + SDValue Ptr = Op->getOperand(NF + 2); - auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); + auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); - SDValue StoredVal = DAG.getUNDEF(VecTupTy); - for (unsigned i = 0; i < NF; i++) - StoredVal = DAG.getNode( - RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, - convertToScalableVector( - ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget), - DAG.getTargetConstant(i, DL, MVT::i32)); + SDValue StoredVal = DAG.getUNDEF(VecTupTy); + for (unsigned i = 0; i < NF; i++) + StoredVal = DAG.getNode( + RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i), + DAG, Subtarget), + DAG.getTargetConstant(i, DL, MVT::i32)); + + SmallVector<SDValue, 10> Ops = { + FixedIntrinsic->getChain(), + IntID, + StoredVal, + Ptr, + Mask, + VL, + DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; + // Insert the stride operand. + if (IsStrided) + Ops.insert(std::next(Ops.begin(), 4), + Op.getOperand(Op.getNumOperands() - 3)); + + return DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, + FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); +} + +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG); - SDValue Ops[] = { - FixedIntrinsic->getChain(), - IntID, - StoredVal, - Ptr, - Mask, - VL, - DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; - - return DAG.getMemIntrinsicNode( - ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, - FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); - } case Intrinsic::riscv_sf_vc_xv_se: return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE); case Intrinsic::riscv_sf_vc_iv_se: diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 5541506..24ebbc3 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -524,16 +524,33 @@ foreach mx = SchedMxListW in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; - 
defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // Latency of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c; + // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites + let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in { + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + } + + defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in { + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 13. Vector Floating-Point Instructions diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index fe0f308..b17cf17 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *V = LHS; unsigned MaskElems = Mask.size(); auto *SrcTy = cast<FixedVectorType>(V->getType()); - unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy); unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); assert(SrcElemBitWidth && "vector elements must have a bitwidth"); unsigned SrcNumElems = SrcTy->getNumElements(); diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 0ddc231..e9bf59c 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { } // Compute alignment from known bits. + auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) { + KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); + unsigned TrailZ = + std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent); + return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + }; + + // Propagate alignment between loads and stores that originate from the + // same base pointer. 
+ DenseMap<Value *, Align> BestBasePointerAligns; + auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) { + APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true); + // Derive the base pointer alignment from the load/store alignment + // and the offset from the base pointer. + Align BasePointerAlign = + commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue()); + + auto [It, Inserted] = + BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign); + if (!Inserted) { + // If the stored base pointer alignment is better than the + // base pointer alignment we derived, we may be able to use it + // to improve the load/store alignment. If not, store the + // improved base pointer alignment for future iterations. + if (It->second > BasePointerAlign) { + Align BetterLoadStoreAlign = + commonAlignment(It->second, OffsetFromBase.getLimitedValue()); + return BetterLoadStoreAlign; + } + It->second = BasePointerAlign; + } + return LoadStoreAlign; + }; + for (BasicBlock &BB : F) { + // We need to reset the map for each block because alignment information + // can only be propagated from instruction A to B if A dominates B. + // This is because control flow (and exception throwing) could be dependent + // on the address (and its alignment) at runtime. Some sort of dominator + // tree approach could be better, but doing a simple forward pass through a + // single basic block is correct too. + BestBasePointerAligns.clear(); + for (Instruction &I : BB) { Changed |= tryToImproveAlign( DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { - KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); - unsigned TrailZ = std::min(Known.countMinTrailingZeros(), - +Value::MaxAlignmentExponent); - return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + return std::max(InferFromKnownBits(I, PtrOp), + InferFromBasePointer(PtrOp, OldAlign)); }); } } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index fcdb8a9..c68149b 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) { case llvm::Instruction::FPExt: return true; case llvm::Instruction::AddrSpaceCast: + case CastInst::PtrToAddr: case CastInst::PtrToInt: case CastInst::IntToPtr: return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index be00fd6..1ac84ef 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -548,9 +548,6 @@ public: protected: friend class LoopVectorizationPlanner; - /// Returns (and creates if needed) the trip count of the widened loop. 
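The InferAlignment change above strips constant offsets from each access, turns the access alignment plus offset into an alignment fact about the base pointer, keeps the best fact per basic block, and reuses it to raise the alignment of later accesses to the same base. A self-contained sketch of that idea, assuming an integer id for the base pointer and a power-of-two helper in place of llvm::commonAlignment:

```cpp
#include <algorithm>
#include <bit>
#include <cstdint>
#include <map>

// Stand-in for commonAlignment(Align, Offset): the largest power of two
// dividing both the known alignment and the byte offset.
static uint64_t commonAlign(uint64_t Align, uint64_t Offset) {
  if (Offset == 0)
    return Align;
  return std::min(Align, uint64_t(1) << std::countr_zero(Offset));
}

// Per-block map from base pointer (here just an id) to best known alignment.
static std::map<int, uint64_t> BestBaseAlign;

// Returns a possibly improved alignment for an access at Base+Offset that is
// currently known to be AccessAlign-aligned.
static uint64_t propagate(int Base, uint64_t Offset, uint64_t AccessAlign) {
  uint64_t BaseAlign = commonAlign(AccessAlign, Offset);
  auto [It, Inserted] = BestBaseAlign.try_emplace(Base, BaseAlign);
  if (!Inserted) {
    // A better base alignment is already known: reuse it for this access.
    if (It->second > BaseAlign)
      return std::max(AccessAlign, commonAlign(It->second, Offset));
    // Otherwise remember the improved base alignment for later accesses.
    It->second = BaseAlign;
  }
  return AccessAlign;
}

int main() {
  // A 16-byte aligned store at offset 0 teaches us the base is 16-aligned...
  propagate(/*Base=*/1, /*Offset=*/0, /*AccessAlign=*/16);
  // ...so a later 1-aligned load at offset 4 can be raised to 4-aligned.
  return propagate(1, 4, 1) == 4 ? 0 : 1;
}
```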
-  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
-
   // Create a check to see if the vector loop should be executed
   Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
@@ -2272,56 +2269,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
-  if (VectorTripCount)
-    return VectorTripCount;
-
-  Value *TC = getTripCount();
-  IRBuilder<> Builder(InsertBlock->getTerminator());
-
-  Type *Ty = TC->getType();
-  // This is where we can make the step a runtime constant.
-  Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
-  // If the tail is to be folded by masking, round the number of iterations N
-  // up to a multiple of Step instead of rounding down. This is done by first
-  // adding Step-1 and then rounding down. Note that it's ok if this addition
-  // overflows: the vector induction variable will eventually wrap to zero given
-  // that it starts at zero and its Step is a power of two; the loop will then
-  // exit, with the last early-exit vector comparison also producing all-true.
-  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
-  // is accounted for in emitIterationCountCheck that adds an overflow check.
-  if (Cost->foldTailByMasking()) {
-    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
-           "VF*UF must be a power of 2 when folding tail by masking");
-    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
-                           "n.rnd.up");
-  }
-
-  // Now we need to generate the expression for the part of the loop that the
-  // vectorized body will execute. This is equal to N - (N % Step) if scalar
-  // iterations are not required for correctness, or N - Step, otherwise. Step
-  // is equal to the vectorization factor (number of SIMD elements) times the
-  // unroll factor (number of SIMD instructions).
-  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
-  // There are cases where we *must* run at least one iteration in the remainder
-  // loop. See the cost model for when this can happen. If the step evenly
-  // divides the trip count, we set the remainder to be equal to the step. If
-  // the step does not evenly divide the trip count, no adjustment is necessary
-  // since there will already be scalar iterations. Note that the minimum
-  // iterations check ensures that N >= Step.
-  if (Cost->requiresScalarEpilogue(VF.isVector())) {
-    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
-    R = Builder.CreateSelect(IsZero, Step, R);
-  }
-
-  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
-  return VectorTripCount;
-}
-
 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
@@ -7354,6 +7301,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // Canonicalize EVL loops after regions are dissolved.
   VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
   VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+  VPlanTransforms::materializeVectorTripCount(
+      BestVPlan, VectorPH, CM.foldTailByMasking(),
+      CM.requiresScalarEpilogue(BestVF.isVector()));
 
   // Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7410,8 +7360,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute( - ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); + BestVPlan.prepareToExecute(State); replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); // Move check blocks to their final position. @@ -9407,13 +9356,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - // If index is the vector trip count, the concrete value will only be set in - // prepareToExecute, leading to missed simplifications, e.g. if it is 0. - // TODO: Remove the special case for the vector trip count once it is computed - // in VPlan and can be used during VPlan simplification. - assert((DerivedIV != Index || - getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && - "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 39011e7..ec06a21 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12050,7 +12050,8 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, for (auto [V, Op] : zip(VL, Operands.front())) { auto *I = dyn_cast<Instruction>(Op); if (!I || !I->hasOneUse()) { - FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind); + if (auto *OpI = dyn_cast<Instruction>(V)) + FMACost += TTI.getInstructionCost(OpI, CostKind); if (I) FMACost += TTI.getInstructionCost(I, CostKind); continue; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index f32d57f..e414c12 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes( case Instruction::Opcode::FPToUI: case Instruction::Opcode::FPToSI: case Instruction::Opcode::FPExt: + case Instruction::Opcode::PtrToAddr: case Instruction::Opcode::PtrToInt: case Instruction::Opcode::IntToPtr: case Instruction::Opcode::SIToFP: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 73babcc..a820b524 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -951,15 +951,9 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) { - if (!VectorTripCount.getUnderlyingValue()) - VectorTripCount.setUnderlyingValue(VectorTripCountV); - else - assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && - "VectorTripCount set earlier must much VectorTripCountV"); - +void VPlan::prepareToExecute(VPTransformState &State) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - Type *TCTy = VectorTripCountV->getType(); + Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount()); // FIXME: Model VF * UF computation completely in VPlan. 
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42cdd5..6f098351 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2408,11 +2408,11 @@ public:
     // TODO: extend the masked interleaved-group support to reversed access.
     assert((!Mask || !IG->isReverse()) &&
            "Reversed masked interleave-group not supported.");
-    for (unsigned i = 0; i < IG->getFactor(); ++i)
-      if (Instruction *I = IG->getMember(i)) {
-        if (I->getType()->isVoidTy())
+    for (unsigned I = 0; I < IG->getFactor(); ++I)
+      if (Instruction *Inst = IG->getMember(I)) {
+        if (Inst->getType()->isVoidTy())
           continue;
-        new VPValue(I, this);
+        new VPValue(Inst, this);
       }
 
     for (auto *SV : StoredValues)
@@ -3969,7 +3969,7 @@ public:
   }
 
   /// Prepare the plan for execution, setting up the required live-in values.
-  void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
+  void prepareToExecute(VPTransformState &State);
 
   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1c8bd6c..34b2abf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3278,6 +3278,67 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
   BTC->replaceAllUsesWith(TCMO);
 }
 
+void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
+                                                 VPBasicBlock *VectorPHVPBB,
+                                                 bool TailByMasking,
+                                                 bool RequiresScalarEpilogue) {
+  VPValue &VectorTC = Plan.getVectorTripCount();
+  assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
+  // There's nothing to do if there are no users of the vector trip count or its
+  // IR value has already been set.
+  if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
+    return;
+
+  VPValue *TC = Plan.getTripCount();
+  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
+  VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
+  VPValue *Step = &Plan.getVFxUF();
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+  // is accounted for in emitIterationCountCheck that adds an overflow check.
+  if (TailByMasking) {
+    TC = Builder.createNaryOp(
+        Instruction::Add,
+        {TC, Builder.createNaryOp(
+                 Instruction::Sub,
+                 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+        DebugLoc::getCompilerGenerated(), "n.rnd.up");
+  }
+
+  // Now we need to generate the expression for the part of the loop that the
+  // vectorized body will execute. This is equal to N - (N % Step) if scalar
+  // iterations are not required for correctness, or N - Step, otherwise. Step
+  // is equal to the vectorization factor (number of SIMD elements) times the
+  // unroll factor (number of SIMD instructions).
+  VPValue *R =
+      Builder.createNaryOp(Instruction::URem, {TC, Step},
+                           DebugLoc::getCompilerGenerated(), "n.mod.vf");
+
+  // There are cases where we *must* run at least one iteration in the remainder
+  // loop. See the cost model for when this can happen. If the step evenly
+  // divides the trip count, we set the remainder to be equal to the step. If
+  // the step does not evenly divide the trip count, no adjustment is necessary
+  // since there will already be scalar iterations. Note that the minimum
+  // iterations check ensures that N >= Step.
+  if (RequiresScalarEpilogue) {
+    assert(!TailByMasking &&
+           "requiring scalar epilogue is not supported with fail folding");
+    VPValue *IsZero = Builder.createICmp(
+        CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+    R = Builder.createSelect(IsZero, Step, R);
+  }
+
+  VPValue *Res = Builder.createNaryOp(
+      Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
+  VectorTC.replaceAllUsesWith(Res);
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cc50c75..2afe956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,12 @@ struct VPlanTransforms {
                                      unsigned BestUF,
                                      PredicatedScalarEvolution &PSE);
 
+  /// Materialize vector trip count computations to a set of VPInstructions.
+  static void materializeVectorTripCount(VPlan &Plan,
+                                         VPBasicBlock *VectorPHVPBB,
+                                         bool TailByMasking,
+                                         bool RequiresScalarEpilogue);
+
   /// Materialize the backedge-taken count to be computed explicitly using
   /// VPInstructions.
   static void materializeBackedgeTakenCount(VPlan &Plan,
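The materialized VPlan computation above performs the same arithmetic as the removed InnerLoopVectorizer::getOrCreateVectorTripCount: optionally round the trip count up to a multiple of the step when folding the tail, subtract the remainder, and reserve a full step for the scalar epilogue when one is required. A scalar model of the resulting n.vec value, assuming the step (VF x UF) and the trip count fit in 64 bits:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the vector trip count ("n.vec") computed above.
static uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool TailByMasking,
                                bool RequiresScalarEpilogue) {
  assert(!(TailByMasking && RequiresScalarEpilogue));
  if (TailByMasking)
    N += Step - 1; // n.rnd.up: round N up to a multiple of Step.
  uint64_t Rem = N % Step; // n.mod.vf
  if (RequiresScalarEpilogue && Rem == 0)
    Rem = Step; // Always leave at least one full step for the scalar loop.
  return N - Rem; // n.vec
}

int main() {
  // N=17, VF*UF=8: the vector loop covers 16 iterations, 1 is left scalar.
  bool OK = vectorTripCount(17, 8, false, false) == 16;
  // With tail folding the count is rounded up to 24 (3 masked vector steps).
  OK &= vectorTripCount(17, 8, true, false) == 24;
  // With a required scalar epilogue and N=16, 8 iterations stay scalar.
  OK &= vectorTripCount(16, 8, false, true) == 8;
  return OK ? 0 : 1;
}
```

Emitting this as VPInstructions in the vector preheader lets VPlan-level simplification see the computation, which the special case removed from VPDerivedIVRecipe::execute alluded to.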