Diffstat (limited to 'llvm/lib')
527 files changed, 14248 insertions, 6936 deletions
diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp index 672dae1..99d8b11 100644 --- a/llvm/lib/Analysis/CallPrinter.cpp +++ b/llvm/lib/Analysis/CallPrinter.cpp @@ -70,7 +70,7 @@ public: for (Function &F : M->getFunctionList()) { uint64_t localSumFreq = 0; - SmallSet<Function *, 16> Callers; + SmallPtrSet<Function *, 16> Callers; for (User *U : F.users()) if (isa<CallInst>(U)) Callers.insert(cast<Instruction>(U)->getFunction()); @@ -99,7 +99,7 @@ private: bool FoundParallelEdge = true; while (FoundParallelEdge) { - SmallSet<Function *, 16> Visited; + SmallPtrSet<Function *, 16> Visited; FoundParallelEdge = false; for (auto CI = Node->begin(), CE = Node->end(); CI != CE; CI++) { if (!(Visited.insert(CI->second->getFunction())).second) { diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 076f417..b6acda3 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -359,6 +359,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { case Instruction::AddrSpaceCast: // The original value is not captured via this if the new value isn't. return UseCaptureInfo::passthrough(); + case Instruction::PtrToAddr: + // We treat ptrtoaddr as a location-independent capture of the address even + // if it is ultimately not used. Continuing recursive analysis after + // ptrtoaddr would be possible, but we'd need logic to do that correctly, + // which is not the same as the current pointer following logic. + return CaptureComponents::Address; case Instruction::ICmp: { unsigned Idx = U.getOperandNo(); unsigned OtherIdx = 1 - Idx; @@ -399,7 +405,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, SmallVector<const Use *, 20> Worklist; Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking()); - SmallSet<const Use *, 20> Visited; + SmallPtrSet<const Use *, 20> Visited; auto AddUses = [&](const Value *V) { for (const Use &U : V->uses()) { diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index dd98b62..f44937a 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, switch (Opcode) { default: llvm_unreachable("Missing case"); + case Instruction::PtrToAddr: + // TODO: Add some of the ptrtoint folds here as well. 
+ break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; @@ -1659,6 +1662,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::aarch64_sve_convert_from_svbool: case Intrinsic::wasm_alltrue: case Intrinsic::wasm_anytrue: + case Intrinsic::wasm_dot: // WebAssembly float semantics are always known case Intrinsic::wasm_trunc_signed: case Intrinsic::wasm_trunc_unsigned: @@ -3989,6 +3993,30 @@ static Constant *ConstantFoldFixedVectorCall( } return ConstantVector::get(Result); } + case Intrinsic::wasm_dot: { + unsigned NumElements = + cast<FixedVectorType>(Operands[0]->getType())->getNumElements(); + + assert(NumElements == 8 && Result.size() == 4 && + "wasm dot takes i16x8 and produces i32x4"); + assert(Ty->isIntegerTy()); + int32_t MulVector[8]; + + for (unsigned I = 0; I < NumElements; ++I) { + ConstantInt *Elt0 = + cast<ConstantInt>(Operands[0]->getAggregateElement(I)); + ConstantInt *Elt1 = + cast<ConstantInt>(Operands[1]->getAggregateElement(I)); + + MulVector[I] = Elt0->getSExtValue() * Elt1->getSExtValue(); + } + for (unsigned I = 0; I < Result.size(); I++) { + int64_t IAdd = (int64_t)MulVector[I * 2] + (int64_t)MulVector[I * 2 + 1]; + Result[I] = ConstantInt::get(Ty, IAdd); + } + + return ConstantVector::get(Result); + } default: break; } diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 629fa7cd..3a70666 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/FormatVariadic.h" #include <cstdint> #include <optional> @@ -29,20 +30,6 @@ using namespace llvm; using namespace dxil; -static StringRef getResourceClassName(ResourceClass RC) { - switch (RC) { - case ResourceClass::SRV: - return "SRV"; - case ResourceClass::UAV: - return "UAV"; - case ResourceClass::CBuffer: - return "CBuffer"; - case ResourceClass::Sampler: - return "Sampler"; - } - llvm_unreachable("Unhandled ResourceClass"); -} - static StringRef getResourceKindName(ResourceKind RK) { switch (RK) { case ResourceKind::Texture1D: @@ -612,7 +599,12 @@ void ResourceTypeInfo::print(raw_ostream &OS, const DataLayout &DL) const { GlobalVariable *ResourceInfo::createSymbol(Module &M, StructType *Ty) { assert(!Symbol && "Symbol has already been created"); - Symbol = new GlobalVariable(M, Ty, /*isConstant=*/true, + Type *ResTy = Ty; + int64_t Size = Binding.Size; + if (Size != 1) + // unbounded arrays are represented as zero-sized arrays in LLVM IR + ResTy = ArrayType::get(Ty, Size == ~0u ? 
0 : Size); + Symbol = new GlobalVariable(M, ResTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, Name); return Symbol; diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 329bd35..761c566 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -32,6 +33,11 @@ using namespace llvm; #define DL_NAME "delinearize" #define DEBUG_TYPE DL_NAME +static cl::opt<bool> UseFixedSizeArrayHeuristic( + "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden, + cl::desc("When printing analysis, use the heuristic for fixed-size arrays " + "if the default delinearizetion fails.")); + // Return true when S contains at least an undef value. static inline bool containsUndefs(const SCEV *S) { return SCEVExprContains(S, [](const SCEV *S) { @@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr, }); } +static std::optional<APInt> tryIntoAPInt(const SCEV *S) { + if (const auto *Const = dyn_cast<SCEVConstant>(S)) + return Const->getAPInt(); + return std::nullopt; +} + +/// Collects the absolute values of constant steps for all induction variables. +/// Returns true if we can prove that all step recurrences are constants and \p +/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p +/// Steps after divided by \p ElementSize. +static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Steps, + uint64_t ElementSize) { + // End of recursion. The constant value also must be a multiple of + // ElementSize. + if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) { + const uint64_t Mod = Const->getAPInt().urem(ElementSize); + return Mod == 0; + } + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AR || !AR->isAffine()) + return false; + + const SCEV *Step = AR->getStepRecurrence(SE); + std::optional<APInt> StepAPInt = tryIntoAPInt(Step); + if (!StepAPInt) + return false; + + APInt Q; + uint64_t R; + APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R); + if (R != 0) + return false; + + // Bail out when the step is too large. + std::optional<uint64_t> StepVal = Q.tryZExtValue(); + if (!StepVal) + return false; + + Steps.push_back(*StepVal); + return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize); +} + +bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Sizes, + const SCEV *ElementSize) { + if (!ElementSize) + return false; + + std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize); + if (!ElementSizeAPInt || *ElementSizeAPInt == 0) + return false; + + std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue(); + + // Early exit when ElementSize is not a positive constant. + if (!ElementSizeConst) + return false; + + if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) || + Sizes.empty()) { + Sizes.clear(); + return false; + } + + // At this point, Sizes contains the absolute step recurrences for all + // induction variables. Each step recurrence must be a multiple of the size of + // the array element. 
Assuming that the each value represents the size of an + // array for each dimension, attempts to restore the length of each dimension + // by dividing the step recurrence by the next smaller value. For example, if + // we have the following AddRec SCEV: + // + // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) + // + // Then Sizes will become [256, 32, 1] after sorted. We don't know the size of + // the outermost dimension, the next dimension will be computed as 256 / 32 = + // 8, and the last dimension will be computed as 32 / 1 = 32. Thus it results + // in like Arr[UnknownSize][8][32] with elements of size 8 bytes, where Arr is + // a base pointer. + // + // TODO: Catch more cases, e.g., when a step recurrence is not divisible by + // the next smaller one, like A[i][3*j]. + llvm::sort(Sizes.rbegin(), Sizes.rend()); + Sizes.erase(llvm::unique(Sizes), Sizes.end()); + + // The last element in Sizes should be ElementSize. At this point, all values + // in Sizes are assumed to be divided by ElementSize, so replace it with 1. + assert(Sizes.back() != 0 && "Unexpected zero size in Sizes."); + Sizes.back() = 1; + + for (unsigned I = 0; I + 1 < Sizes.size(); I++) { + uint64_t PrevSize = Sizes[I + 1]; + if (Sizes[I] % PrevSize) { + Sizes.clear(); + return false; + } + Sizes[I] /= PrevSize; + } + + // Finally, the last element in Sizes should be ElementSize. + Sizes.back() = *ElementSizeConst; + return true; +} + +/// Splits the SCEV into two vectors of SCEVs representing the subscripts and +/// sizes of an array access, assuming that the array is a fixed size array. +/// +/// E.g., if we have the code like as follows: +/// +/// double A[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// +/// The access function will be represented as an AddRec SCEV like: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// Then findFixedSizeArrayDimensions infers the size of each dimension of the +/// array based on the fact that the value of the step recurrence is a multiple +/// of the size of the corresponding array element. In the above example, it +/// results in the following: +/// +/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes. +/// +/// Finally each subscript will be computed as follows: +/// +/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] +/// +/// Note that this function doesn't check the range of possible values for each +/// subscript, so the caller should perform additional boundary checks if +/// necessary. +/// +/// Also note that this function doesn't guarantee that the original array size +/// is restored "correctly". For example, in the following case: +/// +/// double A[42][4][64]; +/// double B[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// use B[i][2*j][k] +/// +/// The access function for both accesses will be the same: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// The array sizes for both A and B will be computed as +/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B. +/// +/// TODO: At the moment, this function can handle only simple cases. For +/// example, we cannot handle a case where a step recurrence is not divisible +/// by the next smaller step recurrence, e.g., A[i][3*j]. 
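[Editor's aside, not part of the diff: a minimal standalone sketch of the dimension-restoration arithmetic described in the comments above, assuming the absolute step recurrences have already been divided by ElementSize. Plain uint64_t stands in for the SCEV/APInt machinery, and restoreDimensions is a hypothetical name, not an LLVM API.]

// Standalone re-derivation of the dimension arithmetic described above (not
// the LLVM implementation). Input: the absolute step recurrences, already in
// units of ElementSize and in any order; output: the inner dimension lengths.
// For the AddRec {{{0,+,2048},+,256},+,8} with ElementSize=8 the steps are
// {1, 32, 256}, and the recovered shape is [UnknownSize][8][32].
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static bool restoreDimensions(std::vector<uint64_t> Steps,
                              std::vector<uint64_t> &Dims) {
  std::sort(Steps.rbegin(), Steps.rend()); // descending
  Steps.erase(std::unique(Steps.begin(), Steps.end()), Steps.end());
  Steps.back() = 1; // treat the smallest step as one element, as above
  for (size_t I = 0; I + 1 < Steps.size(); ++I) {
    if (Steps[I] % Steps[I + 1]) // e.g. A[i][3*j] is not handled
      return false;
    Dims.push_back(Steps[I] / Steps[I + 1]);
  }
  return true;
}

int main() {
  std::vector<uint64_t> Dims;
  if (restoreDimensions({1, 32, 256}, Dims))
    for (uint64_t D : Dims)
      std::cout << D << ' '; // prints: 8 32
}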
+bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<const SCEV *> &Subscripts, + SmallVectorImpl<const SCEV *> &Sizes, + const SCEV *ElementSize) { + + // First step: find the fixed array size. + SmallVector<uint64_t, 4> ConstSizes; + if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) { + Sizes.clear(); + return false; + } + + // Convert the constant size to SCEV. + for (uint64_t Size : ConstSizes) + Sizes.push_back(SE.getConstant(Expr->getType(), Size)); + + // Second step: compute the access functions for each subscript. + computeAccessFunctions(SE, Expr, Subscripts, Sizes); + + return !Subscripts.empty(); +} + bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts, @@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI, O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; + + auto IsDelinearizationFailed = [&]() { + return Subscripts.size() == 0 || Sizes.size() == 0 || + Subscripts.size() != Sizes.size(); + }; + delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst)); - if (Subscripts.size() == 0 || Sizes.size() == 0 || - Subscripts.size() != Sizes.size()) { + if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) { + Subscripts.clear(); + Sizes.clear(); + delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes, + SE->getElementSize(&Inst)); + } + + if (IsDelinearizationFailed()) { O << "failed to delinearize\n"; continue; } diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index 6694d5c..e088175 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -76,6 +76,26 @@ void DemandedBits::determineLiveOperandBits( computeKnownBits(V2, Known2, DL, &AC, UserI, &DT); } }; + auto GetShiftedRange = [&](uint64_t Min, uint64_t Max, bool ShiftLeft) { + auto ShiftF = [ShiftLeft](const APInt &Mask, unsigned ShiftAmnt) { + return ShiftLeft ? Mask.shl(ShiftAmnt) : Mask.lshr(ShiftAmnt); + }; + AB = APInt::getZero(BitWidth); + uint64_t LoopRange = Max - Min; + APInt Mask = AOut; + APInt Shifted = AOut; // AOut | (AOut << 1) | ... | (AOut << (ShiftAmnt - 1) + for (unsigned ShiftAmnt = 1; ShiftAmnt <= LoopRange; ShiftAmnt <<= 1) { + if (LoopRange & ShiftAmnt) { + // Account for (LoopRange - ShiftAmnt, LoopRange] + Mask |= ShiftF(Shifted, LoopRange - ShiftAmnt + 1); + // Clears the low bit. + LoopRange -= ShiftAmnt; + } + // [0, ShiftAmnt) -> [0, ShiftAmnt * 2) + Shifted |= ShiftF(Shifted, ShiftAmnt); + } + AB = ShiftF(Mask, Min); + }; switch (UserI->getOpcode()) { default: break; @@ -183,6 +203,17 @@ void DemandedBits::determineLiveOperandBits( AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); else if (S->hasNoUnsignedWrap()) AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // similar to Lshr case + GetShiftedRange(Min, Max, /*ShiftLeft=*/false); + const auto *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max + 1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max); } } break; @@ -197,6 +228,24 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). 
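[Editor's aside, not part of the diff: a brute-force check of the GetShiftedRange doubling loop introduced earlier in the DemandedBits hunk. When the shift amount is only known to lie in [Min, Max], the live bits of the operand are the union of AOut shifted by every amount in that range; the sketch below recomputes that union the slow way and asserts it matches the logarithmic construction. Plain uint32_t stands in for APInt, and smearRange is a hypothetical name.]

// Verify the doubling construction against a direct union over [Min, Max].
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t smearRange(uint32_t AOut, unsigned Min, unsigned Max,
                           bool ShiftLeft) {
  auto ShiftF = [ShiftLeft](uint32_t M, unsigned S) {
    return ShiftLeft ? M << S : M >> S;
  };
  uint32_t LoopRange = Max - Min, Mask = AOut, Shifted = AOut;
  for (unsigned S = 1; S <= LoopRange; S <<= 1) {
    if (LoopRange & S) {
      Mask |= ShiftF(Shifted, LoopRange - S + 1);
      LoopRange -= S;
    }
    Shifted |= ShiftF(Shifted, S);
  }
  return ShiftF(Mask, Min);
}

int main() {
  for (unsigned Min = 0; Min < 8; ++Min)
    for (unsigned Max = Min; Max < 8; ++Max)
      for (uint32_t AOut : {0x1u, 0x81u, 0xF0u, 0xABCDu}) {
        uint32_t BruteL = 0, BruteR = 0;
        for (unsigned S = Min; S <= Max; ++S) {
          BruteL |= AOut << S; // union of all left shifts in [Min, Max]
          BruteR |= AOut >> S; // union of all right shifts in [Min, Max]
        }
        assert(smearRange(AOut, Min, Max, true) == BruteL);
        assert(smearRange(AOut, Min, Max, false) == BruteR);
      }
}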
if (cast<LShrOperator>(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // Suppose AOut == 0b0000 0001 + // [min, max] = [1, 3] + // iteration 1 shift by 1 mask is 0b0000 0011 + // iteration 2 shift by 2 mask is 0b0000 1111 + // iteration 3, shiftAmnt = 4 > max - min, we stop. + // + // After the iterations we need one more shift by min, + // to move from 0b0000 1111 to --> 0b0001 1110. + // The loop populates the mask relative to (0,...,max-min), + // but we need coverage from (min, max). + // This is why the shift by min is needed. + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; @@ -217,6 +266,26 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). if (cast<AShrOperator>(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (Max && + (AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue()) { + // Suppose AOut = 0011 1100 + // [min, max] = [1, 3] + // ShiftAmount = 1 : Mask is 1000 0000 + // ShiftAmount = 2 : Mask is 1100 0000 + // ShiftAmount = 3 : Mask is 1110 0000 + // The Mask with Max covers every case in [min, max], + // so we are done + AB.setSignBit(); + } + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 835e270..f33e04e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1531,6 +1531,62 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) { return Q; } +/// Given an affine expression of the form A*k + B, where k is an arbitrary +/// integer, infer the possible range of k based on the known range of the +/// affine expression. If we know A*k + B is non-negative, i.e., +/// +/// A*k + B >= 0 +/// +/// we can derive the following inequalities for k when A is positive: +/// +/// k >= -B / A +/// +/// Since k is an integer, it means k is greater than or equal to the +/// ceil(-B / A). +/// +/// If the upper bound of the affine expression \p UB is passed, the following +/// inequality can be derived as well: +/// +/// A*k + B <= UB +/// +/// which leads to: +/// +/// k <= (UB - B) / A +/// +/// Again, as k is an integer, it means k is less than or equal to the +/// floor((UB - B) / A). +/// +/// The similar logic applies when A is negative, but the inequalities sign flip +/// while working with them. +/// +/// Preconditions: \p A is non-zero, and we know A*k + B is non-negative. 
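[Editor's aside, not part of the diff: a numeric sanity check of the bounds on k derived in the comment above, using int64_t in place of APInt and always supplying an upper bound UB for brevity. inferDomain, floorDiv and ceilDiv are hypothetical helpers, not LLVM APIs.]

// For A*k + B constrained to [0, UB], the admissible k are exactly
// [ceil(-B/A), floor((UB-B)/A)] when A > 0, with the bounds swapped when A < 0.
#include <cassert>
#include <cstdint>
#include <utility>

static int64_t floorDiv(int64_t N, int64_t D) {
  int64_t Q = N / D;
  return (N % D != 0 && ((N < 0) != (D < 0))) ? Q - 1 : Q;
}
static int64_t ceilDiv(int64_t N, int64_t D) { return -floorDiv(-N, D); }

// Mirrors the shape of inferDomainOfAffine for plain integers.
static std::pair<int64_t, int64_t> inferDomain(int64_t A, int64_t B,
                                               int64_t UB) {
  assert(A != 0);
  if (A > 0)
    return {ceilDiv(-B, A), floorDiv(UB - B, A)};
  return {ceilDiv(UB - B, A), floorDiv(-B, A)};
}

int main() {
  for (int64_t A = -5; A <= 5; ++A) {
    if (A == 0)
      continue;
    for (int64_t B = -20; B <= 20; ++B)
      for (int64_t UB = 0; UB <= 30; ++UB) {
        auto [TL, TU] = inferDomain(A, B, UB);
        for (int64_t K = -100; K <= 100; ++K) {
          int64_t V = A * K + B;
          bool InRange = V >= 0 && V <= UB;   // constraint on the affine value
          bool InDomain = K >= TL && K <= TU; // derived constraint on k
          assert(InRange == InDomain);
        }
      }
  }
}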
+static std::pair<std::optional<APInt>, std::optional<APInt>> +inferDomainOfAffine(const APInt &A, const APInt &B, + const std::optional<APInt> &UB) { + assert(A != 0 && "A must be non-zero"); + std::optional<APInt> TL, TU; + if (A.sgt(0)) { + TL = ceilingOfQuotient(-B, A); + LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n"); + // New bound check - modification to Banerjee's e3 check + if (UB) { + // TODO?: Overflow check for UB - B + TU = floorOfQuotient(*UB - B, A); + LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n"); + } + } else { + TU = floorOfQuotient(-B, A); + LLVM_DEBUG(dbgs() << "\t Possible TU = " << *TU << "\n"); + // New bound check - modification to Banerjee's e3 check + if (UB) { + // TODO?: Overflow check for UB - B + TL = ceilingOfQuotient(*UB - B, A); + LLVM_DEBUG(dbgs() << "\t Possible TL = " << *TL << "\n"); + } + } + return std::make_pair(TL, TU); +} + // exactSIVtest - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*i], // where i is an induction variable, c1 and c2 are loop invariant, and a1 @@ -1590,14 +1646,12 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n"); // since SCEV construction normalizes, LM = 0 - APInt UM(Bits, 1, true); - bool UMValid = false; + std::optional<APInt> UM; // UM is perhaps unavailable, let's check if (const SCEVConstant *CUB = collectConstantUpperBound(CurLoop, Delta->getType())) { UM = CUB->getAPInt(); - LLVM_DEBUG(dbgs() << "\t UM = " << UM << "\n"); - UMValid = true; + LLVM_DEBUG(dbgs() << "\t UM = " << *UM << "\n"); } APInt TU(APInt::getSignedMaxValue(Bits)); @@ -1609,44 +1663,33 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n"); LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n"); - SmallVector<APInt, 2> TLVec, TUVec; APInt TB = BM.sdiv(G); - if (TB.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - // New bound check - modification to Banerjee's e3 check - if (UMValid) { - TUVec.push_back(floorOfQuotient(UM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - // New bound check - modification to Banerjee's e3 check - if (UMValid) { - TLVec.push_back(ceilingOfQuotient(UM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - APInt TA = AM.sdiv(G); - if (TA.sgt(0)) { - if (UMValid) { - TUVec.push_back(floorOfQuotient(UM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - // New bound check - modification to Banerjee's e3 check - TLVec.push_back(ceilingOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } else { - if (UMValid) { - TLVec.push_back(ceilingOfQuotient(UM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - // New bound check - modification to Banerjee's e3 check - TUVec.push_back(floorOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } + + // At this point, we have the following equations: + // + // TA*i0 - TB*i1 = TC + // + // Also, we know that the all pairs of (i0, i1) can be expressed as: + // + // (TX + k*TB, TY + k*TA) + // + // where k is an arbitrary integer. 
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, UM); + auto [TL1, TU1] = inferDomainOfAffine(TA, TY, UM); + + auto CreateVec = [](const std::optional<APInt> &V0, + const std::optional<APInt> &V1) { + SmallVector<APInt, 2> Vec; + if (V0) + Vec.push_back(*V0); + if (V1) + Vec.push_back(*V1); + return Vec; + }; + + SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1); + SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1); LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n"); LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n"); @@ -1967,24 +2010,20 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n"); // since SCEV construction seems to normalize, LM = 0 - APInt SrcUM(Bits, 1, true); - bool SrcUMvalid = false; + std::optional<APInt> SrcUM; // SrcUM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(SrcLoop, Delta->getType())) { SrcUM = UpperBound->getAPInt(); - LLVM_DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n"); - SrcUMvalid = true; + LLVM_DEBUG(dbgs() << "\t SrcUM = " << *SrcUM << "\n"); } - APInt DstUM(Bits, 1, true); - bool DstUMvalid = false; + std::optional<APInt> DstUM; // UM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(DstLoop, Delta->getType())) { DstUM = UpperBound->getAPInt(); - LLVM_DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n"); - DstUMvalid = true; + LLVM_DEBUG(dbgs() << "\t DstUM = " << *DstUM << "\n"); } APInt TU(APInt::getSignedMaxValue(Bits)); @@ -1996,47 +2035,39 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, LLVM_DEBUG(dbgs() << "\t TX = " << TX << "\n"); LLVM_DEBUG(dbgs() << "\t TY = " << TY << "\n"); - SmallVector<APInt, 2> TLVec, TUVec; APInt TB = BM.sdiv(G); - if (TB.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - if (SrcUMvalid) { - TUVec.push_back(floorOfQuotient(SrcUM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - if (SrcUMvalid) { - TLVec.push_back(ceilingOfQuotient(SrcUM - TX, TB)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - APInt TA = AM.sdiv(G); - if (TA.sgt(0)) { - TLVec.push_back(ceilingOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - if (DstUMvalid) { - TUVec.push_back(floorOfQuotient(DstUM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - } - } else { - TUVec.push_back(floorOfQuotient(-TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TU = " << TUVec.back() << "\n"); - if (DstUMvalid) { - TLVec.push_back(ceilingOfQuotient(DstUM - TY, TA)); - LLVM_DEBUG(dbgs() << "\t Possible TL = " << TLVec.back() << "\n"); - } - } - if (TLVec.empty() || TUVec.empty()) - return false; + // At this point, we have the following equations: + // + // TA*i - TB*j = TC + // + // Also, we know that the all pairs of (i, j) can be expressed as: + // + // (TX + k*TB, TY + k*TA) + // + // where k is an arbitrary integer. 
+ auto [TL0, TU0] = inferDomainOfAffine(TB, TX, SrcUM); + auto [TL1, TU1] = inferDomainOfAffine(TA, TY, DstUM); LLVM_DEBUG(dbgs() << "\t TA = " << TA << "\n"); LLVM_DEBUG(dbgs() << "\t TB = " << TB << "\n"); + auto CreateVec = [](const std::optional<APInt> &V0, + const std::optional<APInt> &V1) { + SmallVector<APInt, 2> Vec; + if (V0) + Vec.push_back(*V0); + if (V1) + Vec.push_back(*V1); + return Vec; + }; + + SmallVector<APInt, 2> TLVec = CreateVec(TL0, TL1); + SmallVector<APInt, 2> TUVec = CreateVec(TU0, TU1); + if (TLVec.empty() || TUVec.empty()) + return false; + TL = APIntOps::smax(TLVec.front(), TLVec.back()); TU = APIntOps::smin(TUVec.front(), TUVec.back()); LLVM_DEBUG(dbgs() << "\t TL = " << TL << "\n"); @@ -2345,6 +2376,43 @@ static std::optional<APInt> getConstantPart(const SCEV *Expr) { return std::nullopt; } +bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, + const Loop *CurLoop, + const SCEV *&CurLoopCoeff, + APInt &RunningGCD) const { + // If RunningGCD is already 1, exit early. + // TODO: It might be better to continue the recursion to find CurLoopCoeff. + if (RunningGCD == 1) + return true; + + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AddRec) { + assert(isLoopInvariant(Expr, CurLoop) && + "Expected loop invariant expression"); + return true; + } + + assert(AddRec->isAffine() && "Unexpected Expr"); + const SCEV *Start = AddRec->getStart(); + const SCEV *Step = AddRec->getStepRecurrence(*SE); + if (AddRec->getLoop() == CurLoop) { + CurLoopCoeff = Step; + } else { + std::optional<APInt> ConstCoeff = getConstantPart(Step); + + // If the coefficient is the product of a constant and other stuff, we can + // use the constant in the GCD computation. + if (!ConstCoeff) + return false; + + // TODO: What happens if ConstCoeff is the "most negative" signed number + // (e.g. -128 for 8 bit wide APInt)? + RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); + } + + return accumulateCoefficientsGCD(Start, CurLoop, CurLoopCoeff, RunningGCD); +} + //===----------------------------------------------------------------------===// // gcdMIVtest - // Tests an MIV subscript pair for dependence. @@ -2464,40 +2532,11 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, RunningGCD = ExtraGCD; const SCEV *SrcCoeff = AddRec->getStepRecurrence(*SE); const SCEV *DstCoeff = SE->getMinusSCEV(SrcCoeff, SrcCoeff); - const SCEV *Inner = Src; - while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) { - AddRec = cast<SCEVAddRecExpr>(Inner); - const SCEV *Coeff = AddRec->getStepRecurrence(*SE); - if (CurLoop == AddRec->getLoop()) - ; // SrcCoeff == Coeff - else { - // If the coefficient is the product of a constant and other stuff, - // we can use the constant in the GCD computation. - std::optional<APInt> ConstCoeff = getConstantPart(Coeff); - if (!ConstCoeff) - return false; - RunningGCD = - APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); - } - Inner = AddRec->getStart(); - } - Inner = Dst; - while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) { - AddRec = cast<SCEVAddRecExpr>(Inner); - const SCEV *Coeff = AddRec->getStepRecurrence(*SE); - if (CurLoop == AddRec->getLoop()) - DstCoeff = Coeff; - else { - // If the coefficient is the product of a constant and other stuff, - // we can use the constant in the GCD computation. 
- std::optional<APInt> ConstCoeff = getConstantPart(Coeff); - if (!ConstCoeff) - return false; - RunningGCD = - APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); - } - Inner = AddRec->getStart(); - } + + if (!accumulateCoefficientsGCD(Src, CurLoop, SrcCoeff, RunningGCD) || + !accumulateCoefficientsGCD(Dst, CurLoop, DstCoeff, RunningGCD)) + return false; + Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 8be5de3..b8c540c 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -40,6 +40,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { switch (Kind) { default: break; + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: case RecurKind::Mul: case RecurKind::Or: @@ -897,8 +899,11 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( case Instruction::PHI: return InstDesc(I, Prev.getRecKind(), Prev.getExactFPMathInst()); case Instruction::Sub: + return InstDesc( + Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs, I); case Instruction::Add: - return InstDesc(Kind == RecurKind::Add, I); + return InstDesc( + Kind == RecurKind::Add || Kind == RecurKind::AddChainWithSubs, I); case Instruction::Mul: return InstDesc(Kind == RecurKind::Mul, I); case Instruction::And: @@ -917,7 +922,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( I->hasAllowReassoc() ? nullptr : I); case Instruction::Select: if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul || - Kind == RecurKind::Add || Kind == RecurKind::Mul) + Kind == RecurKind::Add || Kind == RecurKind::Mul || + Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); if (isFindIVRecurrenceKind(Kind) && SE) return isFindIVPattern(Kind, L, OrigPhi, I, *SE); @@ -1003,6 +1009,17 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::Sub, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { + LLVM_DEBUG(dbgs() << "Found a SUB reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RecurKind::AddChainWithSubs, TheLoop, FMF, RedDes, + DB, AC, DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a chained ADD-SUB reduction PHI." << *Phi + << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." 
<< *Phi << "\n"); @@ -1201,6 +1218,9 @@ bool RecurrenceDescriptor::isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { switch (Kind) { + case RecurKind::Sub: + return Instruction::Sub; + case RecurKind::AddChainWithSubs: case RecurKind::Add: return Instruction::Add; case RecurKind::Mul: @@ -1288,6 +1308,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { if (isFMulAddIntrinsic(Cur)) return true; + if (Cur->getOpcode() == Instruction::Sub && + Kind == RecurKind::AddChainWithSubs) + return true; + return Cur->getOpcode() == getOpcode(); }; diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 22f4d08..757f689 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -180,6 +180,10 @@ static cl::opt<bool> DisableGEPConstOperand( "disable-gep-const-evaluation", cl::Hidden, cl::init(false), cl::desc("Disables evaluation of GetElementPtr with constant operands")); +static cl::opt<bool> InlineAllViableCalls( + "inline-all-viable-calls", cl::Hidden, cl::init(false), + cl::desc("Inline all viable calls, even if they exceed the inlining " + "threshold")); namespace llvm { std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { if (Attr.isValid()) { @@ -3272,6 +3276,10 @@ InlineCost llvm::getInlineCost( return llvm::InlineCost::getNever(UserDecision->getFailureReason()); } + if (InlineAllViableCalls && isInlineViable(*Callee).isSuccess()) + return llvm::InlineCost::getAlways( + "Inlining forced by -inline-all-viable-calls"); + LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Call.getCaller()->getName() << ")\n"); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 922f25d..c7b0ca9 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -927,8 +927,13 @@ LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) { // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. - return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(), - ResultBitWidth)); + ConstantRange Res = ConstantRange::getEmpty(ResultBitWidth); + if (auto *Trunc = dyn_cast<TruncInst>(CI)) + Res = LHSRange.truncate(ResultBitWidth, Trunc->getNoWrapKind()); + else + Res = LHSRange.castOp(CI->getOpcode(), ResultBitWidth); + + return ValueLatticeElement::getRange(Res); } std::optional<ValueLatticeElement> diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 78d0887..9a2c9ba 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -276,8 +276,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { // this function is only used when one address use dominates the // other, which means that they'll always either have the same // value or one of them will have an undefined value. 
- if (isa<BinaryOperator>(A) || isa<CastInst>(A) || isa<PHINode>(A) || - isa<GetElementPtrInst>(A)) + if (isa<CastInst>(A) || isa<PHINode>(A) || isa<GetElementPtrInst>(A)) if (const Instruction *BI = dyn_cast<Instruction>(B)) if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) return true; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index a553533..bceddd0 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -936,6 +936,12 @@ private: static std::optional<int64_t> getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { + if (isa<ScalableVectorType>(AccessTy)) { + LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy + << "\n"); + return std::nullopt; + } + // The access function must stride over the innermost loop. if (Lp != AR->getLoop()) { LLVM_DEBUG({ @@ -1590,11 +1596,6 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, return 0; assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); - if (isa<ScalableVectorType>(AccessTy)) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy - << "\n"); - return std::nullopt; - } const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (Assume && !AR) @@ -2404,12 +2405,13 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, SmallVector<Instruction *, 4> MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { MemAccessInfo Access(Ptr, IsWrite); - auto &IndexVector = Accesses.find(Access)->second; - + auto I = Accesses.find(Access); SmallVector<Instruction *, 4> Insts; - transform(IndexVector, - std::back_inserter(Insts), - [&](unsigned Idx) { return this->InstMap[Idx]; }); + if (I != Accesses.end()) { + transform(I->second, std::back_inserter(Insts), + [&](unsigned Idx) { return this->InstMap[Idx]; }); + } + return Insts; } diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 518a634..6ba6073 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -58,14 +58,26 @@ static cl::opt<bool, true> // Loop implementation // -bool Loop::isLoopInvariant(const Value *V) const { - if (const Instruction *I = dyn_cast<Instruction>(V)) - return !contains(I); +bool Loop::isLoopInvariant(const Value *V, bool HasCoroSuspendInst) const { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // FIXME: this is semantically inconsistent. We're tracking a proper fix in + // issue #149604. + // If V is a pointer to stack object and L contains a coro.suspend function + // call, then V may not be loop invariant because the ramp function and + // resume function have different stack frames. 
+ if (HasCoroSuspendInst && isa<AllocaInst>(I)) + return false; + else + return !contains(I); + } return true; // All non-instructions are loop invariant } -bool Loop::hasLoopInvariantOperands(const Instruction *I) const { - return all_of(I->operands(), [this](Value *V) { return isLoopInvariant(V); }); +bool Loop::hasLoopInvariantOperands(const Instruction *I, + bool HasCoroSuspendInst) const { + return all_of(I->operands(), [&](Value *V) { + return isLoopInvariant(V, HasCoroSuspendInst); + }); } bool Loop::makeLoopInvariant(Value *V, bool &Changed, Instruction *InsertPt, diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 2b0f212..67c2cfa 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -150,6 +150,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, switch (II->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + Loc = MemoryLocation::getForArgument(II, 0, TLI); + // These intrinsics don't really modify the memory, but returning Mod + // will allow them to be handled conservatively. + return ModRefInfo::Mod; case Intrinsic::invariant_start: Loc = MemoryLocation::getForArgument(II, 1, TLI); // These intrinsics don't really modify the memory, but returning Mod @@ -441,11 +445,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( Intrinsic::ID ID = II->getIntrinsicID(); switch (ID) { case Intrinsic::lifetime_start: { - // FIXME: This only considers queries directly on the invariant-tagged - // pointer, not on query pointers that are indexed off of them. It'd - // be nice to handle that at some point (the right approach is to use - // GetPointerBaseWithConstantOffset). - MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1)); + MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(0)); if (BatchAA.isMustAlias(ArgLoc, MemLoc)) return MemDepResult::getDef(II); continue; diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 28a2640..72b643c 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -191,7 +191,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - assert(ArgIdx == 1 && "Invalid argument index"); + assert(ArgIdx == 0 && "Invalid argument index"); auto *AI = dyn_cast<AllocaInst>(Arg); if (!AI) // lifetime of poison value. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 477e477..d2c445f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -7284,7 +7284,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops, bool &Precise) { Precise = true; // Do a bounded search of the def relation of the requested SCEVs. - SmallSet<const SCEV *, 16> Visited; + SmallPtrSet<const SCEV *, 16> Visited; SmallVector<const SCEV *> Worklist; auto pushOp = [&](const SCEV *S) { if (!Visited.insert(S).second) @@ -7435,7 +7435,15 @@ ScalarEvolution::getLoopProperties(const Loop *L) { if (auto *SI = dyn_cast<StoreInst>(I)) return !SI->isSimple(); - return I->mayThrow() || I->mayWriteToMemory(); + if (I->mayThrow()) + return true; + + // Non-volatile memset / memcpy do not count as side-effect for forward + // progress. 
+ if (isa<MemIntrinsic>(I) && !I->isVolatile()) + return false; + + return I->mayWriteToMemory(); }; LoopProperties LP = {/* HasNoAbnormalExits */ true, @@ -14944,6 +14952,29 @@ const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates( if (!AddRec) return nullptr; + // Check if any of the transformed predicates is known to be false. In that + // case, it doesn't make sense to convert to a predicated AddRec, as the + // versioned loop will never execute. + for (const SCEVPredicate *Pred : TransformPreds) { + auto *WrapPred = dyn_cast<SCEVWrapPredicate>(Pred); + if (!WrapPred || WrapPred->getFlags() != SCEVWrapPredicate::IncrementNSSW) + continue; + + const SCEVAddRecExpr *AddRecToCheck = WrapPred->getExpr(); + const SCEV *ExitCount = getBackedgeTakenCount(AddRecToCheck->getLoop()); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + + const SCEV *Step = AddRecToCheck->getStepRecurrence(*this); + if (!Step->isOne()) + continue; + + ExitCount = getTruncateOrSignExtend(ExitCount, Step->getType()); + const SCEV *Add = getAddExpr(AddRecToCheck->getStart(), ExitCount); + if (isKnownPredicate(CmpInst::ICMP_SLT, Add, AddRecToCheck->getStart())) + return nullptr; + } + // Since the transformation was successful, we can now transfer the SCEV // predicates. Preds.append(TransformPreds.begin(), TransformPreds.end()); diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index abe4985..1e20fca 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -70,7 +70,7 @@ void StackLifetime::collectMarkers() { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); if (!II || !II->isLifetimeStartOrEnd()) continue; - const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI) continue; auto It = AllocaNumbering.find(AI); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c7eb2ec..323ab8b 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, return Cost; } +InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, + unsigned Index) const { + InstructionCost Cost = + TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getInsertExtractValueCost( unsigned Opcode, TTI::TargetCostKind CostKind) const { assert((Opcode == Instruction::InsertValue || @@ -1230,10 +1239,11 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return TTIImpl->getNumberOfParts(Tp); } -InstructionCost -TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE, - const SCEV *Ptr) const { - InstructionCost Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr); +InstructionCost TargetTransformInfo::getAddressComputationCost( + Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { + InstructionCost Cost = + TTIImpl->getAddressComputationCost(PtrTy, SE, Ptr, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1e70228..21bdb2f 100644 --- 
a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6356,27 +6356,6 @@ llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, return nullptr; } -bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP, - unsigned CharSize) { - // Make sure the GEP has exactly three arguments. - if (GEP->getNumOperands() != 3) - return false; - - // Make sure the index-ee is a pointer to array of \p CharSize integers. - // CharSize. - ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType()); - if (!AT || !AT->getElementType()->isIntegerTy(CharSize)) - return false; - - // Check to make sure that the first operand of the GEP is an integer and - // has value 0 so that we are sure we're indexing into the initializer. - const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1)); - if (!FirstIdx || !FirstIdx->isZero()) - return false; - - return true; -} - // If V refers to an initialized global constant, set Slice either to // its initializer if the size of its elements equals ElementSize, or, // for ElementSize == 8, to its representation as an array of unsiged @@ -7415,8 +7394,10 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, case Intrinsic::fshr: case Intrinsic::smax: case Intrinsic::smin: + case Intrinsic::scmp: case Intrinsic::umax: case Intrinsic::umin: + case Intrinsic::ucmp: case Intrinsic::ptrmask: case Intrinsic::fptoui_sat: case Intrinsic::fptosi_sat: @@ -7785,7 +7766,7 @@ bool llvm::mustExecuteUBIfPoisonOnPathTo(Instruction *Root, // The set of all recursive users we've visited (which are assumed to all be // poison because of said visit) - SmallSet<const Value *, 16> KnownPoison; + SmallPtrSet<const Value *, 16> KnownPoison; SmallVector<const Instruction*, 16> Worklist; Worklist.push_back(Root); while (!Worklist.empty()) { @@ -8140,8 +8121,8 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, // Set of instructions that we have proved will yield poison if Inst // does. 
- SmallSet<const Value *, 16> YieldsPoison; - SmallSet<const BasicBlock *, 4> Visited; + SmallPtrSet<const Value *, 16> YieldsPoison; + SmallPtrSet<const BasicBlock *, 4> Visited; YieldsPoison.insert(V); Visited.insert(BB); @@ -9147,7 +9128,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst, return false; for (unsigned I = 0; I != 2; ++I) { - if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) { + if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I)); + Operation && Operation->getNumOperands() >= 2) { Value *LHS = Operation->getOperand(0); Value *RHS = Operation->getOperand(1); if (LHS != PN && RHS != PN) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 520c6a0..3d5bd61 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(fptoui, FPToUI); INSTKEYWORD(fptosi, FPToSI); INSTKEYWORD(inttoptr, IntToPtr); + INSTKEYWORD(ptrtoaddr, PtrToAddr); INSTKEYWORD(ptrtoint, PtrToInt); INSTKEYWORD(bitcast, BitCast); INSTKEYWORD(addrspacecast, AddrSpaceCast); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 13bef1f..1bc2906 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { case lltok::kw_bitcast: case lltok::kw_addrspacecast: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: { unsigned Opc = Lex.getUIntVal(); Type *DestTy = nullptr; @@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fptoui: case lltok::kw_fptosi: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: return parseCast(Inst, PFS, KeywordVal); case lltok::kw_fptrunc: diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index eb83945..36d10d0 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,17 +60,6 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"SRV", llvm::dxil::ResourceClass::SRV}, - {"UAV", llvm::dxil::ResourceClass::UAV}, - {"CBV", llvm::dxil::ResourceClass::CBuffer}, - {"Sampler", llvm::dxil::ResourceClass::Sampler}, -}; - -ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() { - return ArrayRef(ResourceClassNames); -} - static const EnumEntry<RootFlags> RootFlagNames[] = { #define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum}, #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/BinaryFormat/MsgPackDocument.cpp b/llvm/lib/BinaryFormat/MsgPackDocument.cpp index 11598ee..b52f029 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocument.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocument.cpp @@ -104,6 +104,10 @@ DocNode &DocNode::operator=(uint64_t Val) { *this = getDocument()->getNode(Val); return *this; } +DocNode &DocNode::operator=(double Val) { + *this = getDocument()->getNode(Val); + return *this; +} // A level in the document reading stack. 
struct StackLevel { @@ -293,6 +297,9 @@ void Document::writeToBlob(std::string &Blob) { case Type::Binary: MPWriter.write(Node.getBinary()); break; + case Type::Float: + MPWriter.write(Node.getFloat()); + break; case Type::Empty: llvm_unreachable("unhandled empty msgpack node"); default: diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp index f1765d7..8076a26 100644 --- a/llvm/lib/BinaryFormat/SFrame.cpp +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -68,3 +68,11 @@ ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() { }; return ArrayRef(FREOffsets); } + +ArrayRef<EnumEntry<sframe::BaseReg>> sframe::getBaseRegisters() { + static constexpr EnumEntry<sframe::BaseReg> BaseRegs[] = { + {"FP", sframe::BaseReg::FP}, + {"SP", sframe::BaseReg::SP}, + }; + return ArrayRef(BaseRegs); +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 290d873..22a0d0f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) { case bitc::CAST_SITOFP : return Instruction::SIToFP; case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; case bitc::CAST_FPEXT : return Instruction::FPExt; + case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr; case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; case bitc::CAST_BITCAST : return Instruction::BitCast; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05680fa..a3f8254 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { case Instruction::SIToFP : return bitc::CAST_SITOFP; case Instruction::FPTrunc : return bitc::CAST_FPTRUNC; case Instruction::FPExt : return bitc::CAST_FPEXT; + case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR; case Instruction::PtrToInt: return bitc::CAST_PTRTOINT; case Instruction::IntToPtr: return bitc::CAST_INTTOPTR; case Instruction::BitCast : return bitc::CAST_BITCAST; diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp new file mode 100644 index 0000000..73646ad --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -0,0 +1,94 @@ +//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/Support/Process.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +static StringRef getCASIDPrefix() { return "llvmcas://"; } +void BuiltinCASContext::anchor() {} + +Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) { + if (!Reference.consume_front(getCASIDPrefix())) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid cas-id '" + Reference + "'"); + + // FIXME: Allow shortened references? 
+ if (Reference.size() != 2 * sizeof(HashType)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "wrong size for cas-id hash '" + Reference + "'"); + + std::string Binary; + if (!tryGetFromHex(Reference, Binary)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid hash in cas-id '" + Reference + "'"); + + assert(Binary.size() == sizeof(HashType)); + HashType Digest; + llvm::copy(Binary, Digest.data()); + return Digest; +} + +Expected<CASID> BuiltinCAS::parseID(StringRef Reference) { + Expected<HashType> Digest = BuiltinCASContext::parseID(Reference); + if (!Digest) + return Digest.takeError(); + + return CASID::create(&getContext(), toStringRef(*Digest)); +} + +void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) { + SmallString<64> Hash; + toHex(Digest, /*LowerCase=*/true, Hash); + OS << getCASIDPrefix() << Hash; +} + +void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const { + BuiltinCASContext::printID(ID.getHash(), OS); +} + +const BuiltinCASContext &BuiltinCASContext::getDefaultContext() { + static BuiltinCASContext DefaultContext; + return DefaultContext; +} + +Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data), + Refs, Data); +} + +Error BuiltinCAS::validate(const CASID &ID) { + auto Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + auto Handle = load(*Ref); + if (!Handle) + return Handle.takeError(); + + auto Proxy = ObjectProxy::load(*this, *Ref, *Handle); + SmallVector<ObjectRef> Refs; + if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })) + return E; + + ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size()); + auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data); + if (!ID.getHash().equals(Hash)) + return createCorruptObjectError(ID); + + return Error::success(); +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h new file mode 100644 index 0000000..3b5374d --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -0,0 +1,74 @@ +//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_BUILTINCAS_H +#define LLVM_LIB_CAS_BUILTINCAS_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/ObjectStore.h" + +namespace llvm::cas { +class ActionCache; +namespace builtin { + +/// Common base class for builtin CAS implementations using the same CASContext. 
+class BuiltinCAS : public ObjectStore { +public: + BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {} + + Expected<CASID> parseID(StringRef Reference) final; + + Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) = 0; + + virtual Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + return storeImpl(ComputedHash, {}, ArrayRef(Map.data(), Map.size())); + } + + /// Both builtin CAS implementations provide lifetime for free, so this can + /// be const, and readData() and getDataSize() can be implemented on top of + /// it. + virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0; + + ArrayRef<char> getData(ObjectHandle Node, + bool RequiresNullTerminator) const final { + // BuiltinCAS Objects are always null terminated. + return getDataConst(Node); + } + uint64_t getDataSize(ObjectHandle Node) const final { + return getDataConst(Node).size(); + } + + Error createUnknownObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); + } + + Error createCorruptObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt object '" + ID.toString() + "'"); + } + + Error createCorruptStorageError() const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt storage"); + } + + Error validate(const CASID &ID) final; +}; + +} // end namespace builtin +} // end namespace llvm::cas + +#endif // LLVM_LIB_CAS_BUILTINCAS_H diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt new file mode 100644 index 0000000..b2825a1 --- /dev/null +++ b/llvm/lib/CAS/CMakeLists.txt @@ -0,0 +1,11 @@ +add_llvm_component_library(LLVMCAS + BuiltinCAS.cpp + InMemoryCAS.cpp + ObjectStore.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS + + LINK_COMPONENTS + Support +) diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp new file mode 100644 index 0000000..255b89c --- /dev/null +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -0,0 +1,326 @@ +//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/LazyAtomicPointer.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ThreadSafeAllocator.h" +#include "llvm/Support/TrailingObjects.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class InMemoryObject; + +/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses +/// LazyAtomicPointer to coordinate creation of objects. +using InMemoryIndexT = + ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>, + sizeof(HashType)>; + +/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access +/// their hash. 
+using InMemoryIndexValueT = InMemoryIndexT::value_type; + +/// Builtin InMemory CAS that stores CAS object in the memory. +class InMemoryObject { +public: + enum class Kind { + /// Node with refs and data. + RefNode, + + /// Node with refs and data co-allocated. + InlineNode, + + Max = InlineNode, + }; + + Kind getKind() const { return IndexAndKind.getInt(); } + const InMemoryIndexValueT &getIndex() const { + assert(IndexAndKind.getPointer()); + return *IndexAndKind.getPointer(); + } + + ArrayRef<uint8_t> getHash() const { return getIndex().Hash; } + + InMemoryObject() = delete; + InMemoryObject(InMemoryObject &&) = delete; + InMemoryObject(const InMemoryObject &) = delete; + +protected: + InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {} + +private: + enum Counts : int { + NumKindBits = 2, + }; + PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind; + static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT), + "Kind will clobber pointer"); + static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated"); + +public: + ArrayRef<char> getData() const; + + ArrayRef<const InMemoryObject *> getRefs() const; +}; + +class InMemoryRefObject final : public InMemoryObject { +public: + static constexpr Kind KindValue = Kind::RefNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; } + ArrayRef<const InMemoryObject *> getRefs() const { return Refs; } + ArrayRef<char> getDataImpl() const { return Data; } + ArrayRef<char> getData() const { return Data; } + + static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryRefObject)); + return *new (Mem) InMemoryRefObject(I, Refs, Data); + } + +private: + InMemoryRefObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data) + : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) { + assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment"); + assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment"); + assert(*Data.end() == 0 && "Expected null-termination"); + } + + ArrayRef<const InMemoryObject *> Refs; + ArrayRef<char> Data; +}; + +class InMemoryInlineObject final + : public InMemoryObject, + public TrailingObjects<InMemoryInlineObject, const InMemoryObject *, + char> { +public: + static constexpr Kind KindValue = Kind::InlineNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); } + ArrayRef<const InMemoryObject *> getRefsImpl() const { + return ArrayRef(getTrailingObjects<const InMemoryObject *>(), NumRefs); + } + + ArrayRef<char> getData() const { return getDataImpl(); } + ArrayRef<char> getDataImpl() const { + return ArrayRef(getTrailingObjects<char>(), DataSize); + } + + static InMemoryInlineObject & + create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryInlineObject) + + sizeof(uintptr_t) * Refs.size() + Data.size() + 1); + return *new (Mem) InMemoryInlineObject(I, Refs, Data); + } + + size_t numTrailingObjects(OverloadToken<const InMemoryObject *>) const { + return NumRefs; + } + +private: + 
InMemoryInlineObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) + : InMemoryObject(KindValue, I), NumRefs(Refs.size()), + DataSize(Data.size()) { + auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1); + llvm::copy(Refs, BeginRefs); + auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs); + llvm::copy(Data, BeginData); + BeginData[Data.size()] = 0; + } + uint32_t NumRefs; + uint32_t DataSize; +}; + +/// In-memory CAS database and action cache (the latter should be separated). +class InMemoryCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) override; + + CASID getID(const InMemoryIndexValueT &I) const { + StringRef Hash = toStringRef(I.Hash); + return CASID::create(&getContext(), Hash); + } + CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); } + + ObjectHandle getObjectHandle(const InMemoryObject &Node) const { + assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL)); + return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node)); + } + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override { + return getObjectHandle(asInMemoryObject(Ref)); + } + + InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) { + return *Index.insertLazy( + Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); }); + } + + /// TODO: Consider callers to actually do an insert and to return a handle to + /// the slot in the trie. + const InMemoryObject *getInMemoryObject(CASID ID) const { + assert(ID.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Expected ID from same hash schema"); + if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash())) + return P->Data; + return nullptr; + } + + const InMemoryObject &getInMemoryObject(ObjectHandle OH) const { + return *reinterpret_cast<const InMemoryObject *>( + (uintptr_t)OH.getInternalRef(*this)); + } + + const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const { + uintptr_t P = Ref.getInternalRef(*this); + return *reinterpret_cast<const InMemoryObject *>(P); + } + ObjectRef toReference(const InMemoryObject &O) const { + return makeObjectRef(reinterpret_cast<uintptr_t>(&O)); + } + + CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); } + CASID getIDImpl(ReferenceBase Ref) const { + return getID(asInMemoryObject(Ref)); + } + + std::optional<ObjectRef> getReference(const CASID &ID) const final { + if (const InMemoryObject *Object = getInMemoryObject(ID)) + return toReference(*Object); + return std::nullopt; + } + + Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; } + + ArrayRef<char> getDataConst(ObjectHandle Node) const final { + return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); + } + + InMemoryCAS() = default; + +private: + size_t getNumRefs(ObjectHandle Node) const final { + return getInMemoryObject(Node).getRefs().size(); + } + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + return toReference(*getInMemoryObject(Node).getRefs()[I]); + } + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr + /// as a convenient way to store hashes. + /// + /// - Insert nullptr on lookups. 
+ /// - InMemoryObject points back to here. + InMemoryIndexT Index; + + ThreadSafeAllocator<BumpPtrAllocator> Objects; + ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>> + MemoryMaps; +}; + +} // end anonymous namespace + +ArrayRef<char> InMemoryObject::getData() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getDataImpl(); + return cast<InMemoryInlineObject>(this)->getDataImpl(); +} + +ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getRefsImpl(); + return cast<InMemoryInlineObject>(this)->getRefsImpl(); +} + +Expected<ObjectRef> +InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + // Look up the hash in the index, initializing to nullptr if it's new. + ArrayRef<char> Data(Map.data(), Map.size()); + auto &I = indexHash(ComputedHash); + + // Load or generate. + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryRefObject::create(Allocator, I, {}, Data); + }; + const InMemoryObject &Node = + cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)); + + // Save Map if the winning node uses it. + if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node)) + if (RefNode->getData().data() == Map.data()) + new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map)); + + return toReference(Node); +} + +Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + // Look up the hash in the index, initializing to nullptr if it's new. + auto &I = indexHash(ComputedHash); + + // Create the node. + SmallVector<const InMemoryObject *> InternalRefs; + for (ObjectRef Ref : Refs) + InternalRefs.push_back(&asInMemoryObject(Ref)); + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data); + }; + return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator))); +} + +Error InMemoryCAS::forEachRef(ObjectHandle Handle, + function_ref<Error(ObjectRef)> Callback) const { + auto &Node = getInMemoryObject(Handle); + for (const InMemoryObject *Ref : Node.getRefs()) + if (Error E = Callback(toReference(*Ref))) + return E; + return Error::success(); +} + +std::unique_ptr<ObjectStore> cas::createInMemoryCAS() { + return std::make_unique<InMemoryCAS>(); +} diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp new file mode 100644 index 0000000..e0be50b --- /dev/null +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -0,0 +1,162 @@ +//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include <optional> + +using namespace llvm; +using namespace llvm::cas; + +void CASContext::anchor() {} +void ObjectStore::anchor() {} + +LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } + +std::string CASID::toString() const { + std::string S; + raw_string_ostream(S) << *this; + return S; +} + +static void printReferenceBase(raw_ostream &OS, StringRef Kind, + uint64_t InternalRef, std::optional<CASID> ID) { + OS << Kind << "=" << InternalRef; + if (ID) + OS << "[" << *ID << "]"; +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const { + assert(this == &This); + printReferenceBase(OS, "object-handle", InternalRef, std::nullopt); +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const { + assert(this == &This); + + std::optional<CASID> ID; +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + if (CAS) + ID = CAS->getID(This); +#endif + printReferenceBase(OS, "object-ref", InternalRef, ID); +} + +Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) { + std::optional<ObjectHandle> Handle; + if (Error E = loadIfExists(Ref).moveInto(Handle)) + return std::move(E); + if (!Handle) + return createStringError(errc::invalid_argument, + "missing object '" + getID(Ref).toString() + "'"); + return *Handle; +} + +std::unique_ptr<MemoryBuffer> +ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name, + bool RequiresNullTerminator) { + return MemoryBuffer::getMemBuffer( + toStringRef(getData(Node, RequiresNullTerminator)), Name, + RequiresNullTerminator); +} + +void ObjectStore::readRefs(ObjectHandle Node, + SmallVectorImpl<ObjectRef> &Refs) const { + consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })); +} + +Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) { + std::optional<ObjectRef> Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + return getProxy(*Ref); +} + +Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = load(Ref).moveInto(H)) + return std::move(E); + + return ObjectProxy::load(*this, Ref, *H); +} + +Expected<std::optional<ObjectProxy>> +ObjectStore::getProxyIfExists(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = loadIfExists(Ref).moveInto(H)) + return std::move(E); + if (!H) + return std::nullopt; + return ObjectProxy::load(*this, Ref, *H); +} + +Error ObjectStore::createUnknownObjectError(const CASID &ID) { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); +} + +Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs, + StringRef Data) { + Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data)); + if (!Ref) + return Ref.takeError(); + return getProxy(*Ref); +} + +Expected<ObjectRef> +ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional<sys::fs::file_status> Status) { + // TODO: For the on-disk CAS implementation use cloning to store it as a + // 
standalone file if the file-system supports it and the file is large. + uint64_t Size = Status ? Status->getSize() : -1; + auto Buffer = MemoryBuffer::getOpenFile(FD, /*Filename=*/"", Size); + if (!Buffer) + return errorCodeToError(Buffer.getError()); + + return store({}, arrayRefFromStringRef<char>((*Buffer)->getBuffer())); +} + +Error ObjectStore::validateTree(ObjectRef Root) { + SmallDenseSet<ObjectRef> ValidatedRefs; + SmallVector<ObjectRef, 16> RefsToValidate; + RefsToValidate.push_back(Root); + + while (!RefsToValidate.empty()) { + ObjectRef Ref = RefsToValidate.pop_back_val(); + auto [I, Inserted] = ValidatedRefs.insert(Ref); + if (!Inserted) + continue; // already validated. + if (Error E = validate(getID(Ref))) + return E; + Expected<ObjectHandle> Obj = load(Ref); + if (!Obj) + return Obj.takeError(); + if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error { + RefsToValidate.push_back(R); + return Error::success(); + })) + return E; + } + return Error::success(); +} + +std::unique_ptr<MemoryBuffer> +ObjectProxy::getMemoryBuffer(StringRef Name, + bool RequiresNullTerminator) const { + return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator); +} diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index a561830..a943297 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(FileCheck) add_subdirectory(InterfaceStub) add_subdirectory(IRPrinter) add_subdirectory(IRReader) +add_subdirectory(CAS) add_subdirectory(CGData) add_subdirectory(CodeGen) add_subdirectory(CodeGenTypes) diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index e7b9417..2ef96cc 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -69,18 +69,10 @@ unsigned llvm::ComputeLinearIndex(Type *Ty, return CurIndex + 1; } -/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of -/// EVTs that represent all the individual underlying -/// non-aggregate types that comprise it. -/// -/// If Offsets is non-null, it points to a vector to be filled in -/// with the in-memory offsets of each of the individual values. -/// -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<EVT> *MemVTs, - SmallVectorImpl<TypeSize> *Offsets, - TypeSize StartingOffset) { +void llvm::ComputeValueTypes(const DataLayout &DL, Type *Ty, + SmallVectorImpl<Type *> &Types, + SmallVectorImpl<TypeSize> *Offsets, + TypeSize StartingOffset) { assert((Ty->isScalableTy() == StartingOffset.isScalable() || StartingOffset.isZero()) && "Offset/TypeSize mismatch!"); @@ -90,15 +82,13 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, // us to support structs with scalable vectors for operations that don't // need offsets. const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr; - for (StructType::element_iterator EB = STy->element_begin(), - EI = EB, + for (StructType::element_iterator EB = STy->element_begin(), EI = EB, EE = STy->element_end(); EI != EE; ++EI) { // Don't compute the element offset if we didn't get a StructLayout above. TypeSize EltOffset = SL ? 
SL->getElementOffset(EI - EB) : TypeSize::getZero(); - ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets, - StartingOffset + EltOffset); + ComputeValueTypes(DL, *EI, Types, Offsets, StartingOffset + EltOffset); } return; } @@ -107,21 +97,39 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *EltTy = ATy->getElementType(); TypeSize EltSize = DL.getTypeAllocSize(EltTy); for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) - ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets, - StartingOffset + i * EltSize); + ComputeValueTypes(DL, EltTy, Types, Offsets, + StartingOffset + i * EltSize); return; } // Interpret void as zero return values. if (Ty->isVoidTy()) return; - // Base case: we can get an EVT for this LLVM IR type. - ValueVTs.push_back(TLI.getValueType(DL, Ty)); - if (MemVTs) - MemVTs->push_back(TLI.getMemValueType(DL, Ty)); + Types.push_back(Ty); if (Offsets) Offsets->push_back(StartingOffset); } +/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of +/// EVTs that represent all the individual underlying +/// non-aggregate types that comprise it. +/// +/// If Offsets is non-null, it points to a vector to be filled in +/// with the in-memory offsets of each of the individual values. +/// +void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl<EVT> &ValueVTs, + SmallVectorImpl<EVT> *MemVTs, + SmallVectorImpl<TypeSize> *Offsets, + TypeSize StartingOffset) { + SmallVector<Type *> Types; + ComputeValueTypes(DL, Ty, Types, Offsets, StartingOffset); + for (Type *Ty : Types) { + ValueVTs.push_back(TLI.getValueType(DL, Ty)); + if (MemVTs) + MemVTs->push_back(TLI.getMemValueType(DL, Ty)); + } +} + void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, SmallVectorImpl<EVT> *MemVTs, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c72b6e8..23a3543 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV, break; // Error } + case Instruction::PtrToAddr: case Instruction::PtrToInt: { const DataLayout &DL = getDataLayout(); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3f3d5dc9..278dd65 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1915,7 +1915,6 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // TODO: the "order" argument type is "int", not int32. So // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints. 
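(The AtomicExpand hunks that follow drop the explicit size operand from the lifetime markers around the libcall temporaries; the covered region is now implied by the alloca itself. A minimal sketch of the resulting pattern, with placeholder names:)

#include "llvm/IR/IRBuilder.h"

// Sketch only: mark a temporary's lifetime around its use as a libcall
// argument. CreateLifetimeStart/End now take just the alloca.
static void markTempLifetime(llvm::IRBuilder<> &Builder, llvm::AllocaInst *Tmp,
                             llvm::Value *Init, llvm::Align A) {
  Builder.CreateLifetimeStart(Tmp); // no size operand; covers the whole alloca
  Builder.CreateAlignedStore(Init, Tmp, A);
  // ... Tmp is passed to the __atomic_* libcall here ...
  Builder.CreateLifetimeEnd(Tmp);
}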
- ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size); assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO"); Constant *OrderingVal = ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering)); @@ -2012,7 +2011,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (CASExpected) { AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); AllocaCASExpected->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeStart(AllocaCASExpected); Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment); Args.push_back(AllocaCASExpected); } @@ -2026,7 +2025,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( } else { AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); AllocaValue->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaValue, SizeVal64); + Builder.CreateLifetimeStart(AllocaValue); Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment); Args.push_back(AllocaValue); } @@ -2036,7 +2035,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (!CASExpected && HasResult && !UseSizedLibcall) { AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); AllocaResult->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaResult, SizeVal64); + Builder.CreateLifetimeStart(AllocaResult); Args.push_back(AllocaResult); } @@ -2069,7 +2068,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // And then, extract the results... if (ValueOperand && !UseSizedLibcall) - Builder.CreateLifetimeEnd(AllocaValue, SizeVal64); + Builder.CreateLifetimeEnd(AllocaValue); if (CASExpected) { // The final result from the CAS is {load of 'expected' alloca, bool result @@ -2078,7 +2077,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( Value *V = PoisonValue::get(FinalResultTy); Value *ExpectedOut = Builder.CreateAlignedLoad( CASExpected->getType(), AllocaCASExpected, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeEnd(AllocaCASExpected); V = Builder.CreateInsertValue(V, ExpectedOut, 0); V = Builder.CreateInsertValue(V, Result, 1); I->replaceAllUsesWith(V); @@ -2089,7 +2088,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); + Builder.CreateLifetimeEnd(AllocaResult); } I->replaceAllUsesWith(V); } diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index dcfd9aa..7292bc2 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1787,10 +1787,18 @@ ReoptimizeBlock: // below were performed for EH "FallThrough" blocks. Therefore, even if // that appears not to be happening anymore, we should assume that it is // possible and not remove the "!FallThrough()->isEHPad" condition below. + // + // Similarly, the analyzeBranch call does not consider callbr, which also + // introduces the possibility of infinite rotation, as there may be + // multiple successors of PrevBB. Thus we check such case by + // FallThrough->isInlineAsmBrIndirectTarget(). + // NOTE: Checking if PrevBB contains callbr is more precise, but much + // more expensive. 
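(The "more precise, but much more expensive" check mentioned in the NOTE above would amount to scanning PrevBB's terminators for a lowered callbr, i.e. an INLINEASM_BR. A sketch of what that alternative might look like; it is not what the patch does:)

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"

// True if MBB ends in a callbr lowered to INLINEASM_BR, which can add
// indirect successors that analyzeBranch does not model.
static bool endsInCallBr(const llvm::MachineBasicBlock &MBB) {
  return llvm::any_of(MBB.terminators(), [](const llvm::MachineInstr &MI) {
    return MI.getOpcode() == llvm::TargetOpcode::INLINEASM_BR;
  });
}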
MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector<MachineOperand, 4> PrevCond; - if (FallThrough != MF.end() && - !FallThrough->isEHPad() && + + if (FallThrough != MF.end() && !FallThrough->isEHPad() && + !FallThrough->isInlineAsmBrIndirectTarget() && !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && PrevBB.isSuccessor(&*FallThrough)) { MBB->moveAfter(&MF.back()); diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp index b71e781..df34331 100644 --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -89,7 +89,7 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Ins[i].OrigTy, *this)) report_fatal_error("unable to allocate function argument #" + Twine(i)); } } @@ -102,7 +102,7 @@ bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this)) return false; } return true; @@ -116,7 +116,7 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, *this)) report_fatal_error("unable to allocate function return #" + Twine(i)); } } @@ -129,7 +129,8 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, for (unsigned i = 0; i != NumOps; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, Outs[i].OrigTy, + *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " << ArgVT << '\n'; @@ -142,12 +143,13 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, /// Same as above except it takes vectors of types and argument flags. void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &Flags, + SmallVectorImpl<Type *> &OrigTys, CCAssignFn Fn) { unsigned NumOps = ArgVTs.size(); for (unsigned i = 0; i != NumOps; ++i) { MVT ArgVT = ArgVTs[i]; ISD::ArgFlagsTy ArgFlags = Flags[i]; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, OrigTys[i], *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " << ArgVT << '\n'; @@ -164,7 +166,7 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, for (unsigned i = 0, e = Ins.size(); i != e; ++i) { MVT VT = Ins[i].VT; ISD::ArgFlagsTy Flags = Ins[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) { + if (Fn(i, VT, VT, CCValAssign::Full, Flags, Ins[i].OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call result #" << i << " has unhandled type " << VT << '\n'; @@ -175,8 +177,8 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, } /// Same as above except it's specialized for calls that produce a single value. 
-void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) { - if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) { +void CCState::AnalyzeCallResult(MVT VT, Type *OrigTy, CCAssignFn Fn) { + if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call result has unhandled type " << VT << '\n'; @@ -213,7 +215,8 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCRegister> &Regs, // location in memory. bool HaveRegParm; do { - if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) { + Type *OrigTy = EVT(VT).getTypeForEVT(Context); + if (Fn(0, VT, VT, CCValAssign::Full, Flags, OrigTy, *this)) { #ifndef NDEBUG dbgs() << "Call has unhandled type " << VT << " while computing remaining regparms\n"; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9223739..0e40a92 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -377,7 +377,7 @@ public: /// to be optimized again. /// Note: Consider building time in this pass, when a BB updated, we need /// to insert such BB into FreshBBs for huge function. - SmallSet<BasicBlock *, 32> FreshBBs; + SmallPtrSet<BasicBlock *, 32> FreshBBs; void releaseMemory() { // Clear per function information. @@ -1105,7 +1105,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, /// Replace all old uses with new ones, and push the updated BBs into FreshBBs. static void replaceAllUsesWith(Value *Old, Value *New, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHuge) { auto *OldI = dyn_cast<Instruction>(Old); if (OldI) { @@ -2135,7 +2135,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant( // Rem = rem == RemAmtLoopInvariant ? 0 : Rem; static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, const LoopInfo *LI, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHuge) { Value *AddOffset, *RemAmt, *AddInst; PHINode *LoopIncrPN; @@ -2534,11 +2534,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] /// /// If the transform is performed, return true and set ModifiedDT to true. -static bool despeculateCountZeros(IntrinsicInst *CountZeros, - LoopInfo &LI, +static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHugeFunc) { // If a zero input is undefined, it doesn't make sense to despeculate that. 
if (match(CountZeros->getOperand(1), m_One())) @@ -4351,7 +4350,7 @@ private: PhiNodeSet &PhiNodesToMatch) { SmallVector<PHIPair, 8> WorkList; Matcher.insert({PHI, Candidate}); - SmallSet<PHINode *, 8> MatchedPHIs; + SmallPtrSet<PHINode *, 8> MatchedPHIs; MatchedPHIs.insert(PHI); WorkList.push_back({PHI, Candidate}); SmallSet<PHIPair, 8> Visited; @@ -8635,7 +8634,7 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, } static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, - SmallSet<BasicBlock *, 32> &FreshBBs, + SmallPtrSet<BasicBlock *, 32> &FreshBBs, bool IsHugeFunc) { // Try and convert // %c = icmp ult %x, 8 diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 9b2851e..cd21e25 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -67,6 +67,7 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -108,6 +109,42 @@ static bool isNeg(Value *V); static Value *getNegOperand(Value *V); namespace { +struct ComplexValue { + Value *Real = nullptr; + Value *Imag = nullptr; + + bool operator==(const ComplexValue &Other) const { + return Real == Other.Real && Imag == Other.Imag; + } +}; +hash_code hash_value(const ComplexValue &Arg) { + return hash_combine(DenseMapInfo<Value *>::getHashValue(Arg.Real), + DenseMapInfo<Value *>::getHashValue(Arg.Imag)); +} +} // end namespace +typedef SmallVector<struct ComplexValue, 2> ComplexValues; + +namespace llvm { +template <> struct DenseMapInfo<ComplexValue> { + static inline ComplexValue getEmptyKey() { + return {DenseMapInfo<Value *>::getEmptyKey(), + DenseMapInfo<Value *>::getEmptyKey()}; + } + static inline ComplexValue getTombstoneKey() { + return {DenseMapInfo<Value *>::getTombstoneKey(), + DenseMapInfo<Value *>::getTombstoneKey()}; + } + static unsigned getHashValue(const ComplexValue &Val) { + return hash_combine(DenseMapInfo<Value *>::getHashValue(Val.Real), + DenseMapInfo<Value *>::getHashValue(Val.Imag)); + } + static bool isEqual(const ComplexValue &LHS, const ComplexValue &RHS) { + return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag; + } +}; +} // end namespace llvm + +namespace { template <typename T, typename IterT> std::optional<T> findCommonBetweenCollections(IterT A, IterT B) { auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); }); @@ -145,7 +182,13 @@ struct ComplexDeinterleavingCompositeNode { ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, Value *R, Value *I) - : Operation(Op), Real(R), Imag(I) {} + : Operation(Op) { + Vals.push_back({R, I}); + } + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, + ComplexValues &Other) + : Operation(Op), Vals(Other) {} private: friend class ComplexDeinterleavingGraph; @@ -155,8 +198,7 @@ private: public: ComplexDeinterleavingOperation Operation; - Value *Real; - Value *Imag; + ComplexValues Vals; // This two members are required exclusively for generating // ComplexDeinterleavingOperation::Symmetric operations. 
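(The hash_value() overload and the DenseMapInfo specialization introduced above are what allow a (Real, Imag) pair, and vectors of such pairs, to serve as cache keys in this pass. A tiny sketch, with R and I standing in for arbitrary Value pointers; it assumes ComplexValue and its DenseMapInfo specialization are visible as defined above:)

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"

static unsigned lookupOrInsert(llvm::Value *R, llvm::Value *I) {
  llvm::DenseMap<ComplexValue, unsigned> Cache;
  Cache.try_emplace(ComplexValue{R, I}, 1u);
  // Hashing and equality come from DenseMapInfo<ComplexValue>, so the same
  // {Real, Imag} pair finds the entry again.
  auto It = Cache.find(ComplexValue{R, I});
  return It == Cache.end() ? 0u : It->second;
}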
@@ -192,10 +234,12 @@ public: }; OS << "- CompositeNode: " << this << "\n"; - OS << " Real: "; - PrintValue(Real); - OS << " Imag: "; - PrintValue(Imag); + for (unsigned I = 0; I < Vals.size(); I++) { + OS << " Real(" << I << ") : "; + PrintValue(Vals[I].Real); + OS << " Imag(" << I << ") : "; + PrintValue(Vals[I].Imag); + } OS << " ReplacementNode: "; PrintValue(ReplacementNode); OS << " Operation: " << (int)Operation << "\n"; @@ -233,14 +277,16 @@ public: }; explicit ComplexDeinterleavingGraph(const TargetLowering *TL, - const TargetLibraryInfo *TLI) - : TL(TL), TLI(TLI) {} + const TargetLibraryInfo *TLI, + unsigned Factor) + : TL(TL), TLI(TLI), Factor(Factor) {} private: const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; + unsigned Factor; SmallVector<NodePtr> CompositeNodes; - DenseMap<std::pair<Value *, Value *>, NodePtr> CachedResult; + DenseMap<ComplexValues, NodePtr> CachedResult; SmallPtrSet<Instruction *, 16> FinalInstructions; @@ -305,10 +351,25 @@ private: I); } + NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, + ComplexValues &Vals) { +#ifndef NDEBUG + for (auto &V : Vals) { + assert( + ((Operation != ComplexDeinterleavingOperation::ReductionPHI && + Operation != ComplexDeinterleavingOperation::ReductionOperation) || + (V.Real && V.Imag)) && + "Reduction related nodes must have Real and Imaginary parts"); + } +#endif + return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation, + Vals); + } + NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); - if (Node->Real) - CachedResult[{Node->Real, Node->Imag}] = Node; + if (Node->Vals[0].Real) + CachedResult[Node->Vals] = Node; return Node; } @@ -340,11 +401,17 @@ private: /// 270: r: ar + bi /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); - NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); + NodePtr identifySymmetricOperation(ComplexValues &Vals); NodePtr identifyPartialReduction(Value *R, Value *I); NodePtr identifyDotProduct(Value *Inst); - NodePtr identifyNode(Value *R, Value *I); + NodePtr identifyNode(ComplexValues &Vals); + + NodePtr identifyNode(Value *R, Value *I) { + ComplexValues Vals; + Vals.push_back({R, I}); + return identifyNode(Vals); + } /// Determine if a sum of complex numbers can be formed from \p RealAddends /// and \p ImagAddens. If \p Accumulator is not null, add the result to it. @@ -388,15 +455,16 @@ private: /// operation: /// * Using two shufflevectors with even indices for /pReal instruction and /// odd indices for /pImag instructions (only for fixed-width vectors) - /// * Using two extractvalue instructions applied to `vector.deinterleave2` - /// intrinsic (for both fixed and scalable vectors) - NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); + /// * Using N extractvalue instructions applied to `vector.deinterleaveN` + /// intrinsics (for both fixed and scalable vectors) where N is a multiple of + /// 2. + NodePtr identifyDeinterleave(ComplexValues &Vals); /// identifying the operation that represents a complex number repeated in a /// Splat vector. There are two possible types of splats: ConstantExpr with /// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an /// initialization mask with all values set to zero. 
- NodePtr identifySplat(Value *Real, Value *Imag); + NodePtr identifySplat(ComplexValues &Vals); NodePtr identifyPHINode(Instruction *Real, Instruction *Imag); @@ -447,7 +515,7 @@ public: bool runOnFunction(Function &F); private: - bool evaluateBasicBlock(BasicBlock *B); + bool evaluateBasicBlock(BasicBlock *B, unsigned Factor); const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; @@ -500,7 +568,15 @@ bool ComplexDeinterleaving::runOnFunction(Function &F) { bool Changed = false; for (auto &B : F) - Changed |= evaluateBasicBlock(&B); + Changed |= evaluateBasicBlock(&B, 2); + + // TODO: Permit changes for both interleave factors in the same function. + if (!Changed) { + for (auto &B : F) + Changed |= evaluateBasicBlock(&B, 4); + } + + // TODO: We can also support interleave factors of 6 and 8 if needed. return Changed; } @@ -545,8 +621,8 @@ Value *getNegOperand(Value *V) { return I->getOperand(1); } -bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { - ComplexDeinterleavingGraph Graph(TL, TLI); +bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B, unsigned Factor) { + ComplexDeinterleavingGraph Graph(TL, TLI, Factor); if (Graph.collectPotentialReductions(B)) Graph.identifyReductionNodes(); @@ -669,6 +745,7 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, Instruction *Imag) { LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag << "\n"); + // Determine rotation auto IsAdd = [](unsigned Op) { return Op == Instruction::FAdd || Op == Instruction::Add; @@ -865,43 +942,57 @@ static bool isInstructionPotentiallySymmetric(Instruction *I) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, - Instruction *Imag) { - if (Real->getOpcode() != Imag->getOpcode()) - return nullptr; +ComplexDeinterleavingGraph::identifySymmetricOperation(ComplexValues &Vals) { + auto *FirstReal = cast<Instruction>(Vals[0].Real); + unsigned FirstOpc = FirstReal->getOpcode(); + for (auto &V : Vals) { + auto *Real = cast<Instruction>(V.Real); + auto *Imag = cast<Instruction>(V.Imag); + if (Real->getOpcode() != FirstOpc || Imag->getOpcode() != FirstOpc) + return nullptr; - if (!isInstructionPotentiallySymmetric(Real) || - !isInstructionPotentiallySymmetric(Imag)) - return nullptr; + if (!isInstructionPotentiallySymmetric(Real) || + !isInstructionPotentiallySymmetric(Imag)) + return nullptr; - auto *R0 = Real->getOperand(0); - auto *I0 = Imag->getOperand(0); + if (isa<FPMathOperator>(FirstReal)) + if (Real->getFastMathFlags() != FirstReal->getFastMathFlags() || + Imag->getFastMathFlags() != FirstReal->getFastMathFlags()) + return nullptr; + } - NodePtr Op0 = identifyNode(R0, I0); + ComplexValues OpVals; + for (auto &V : Vals) { + auto *R0 = cast<Instruction>(V.Real)->getOperand(0); + auto *I0 = cast<Instruction>(V.Imag)->getOperand(0); + OpVals.push_back({R0, I0}); + } + + NodePtr Op0 = identifyNode(OpVals); NodePtr Op1 = nullptr; if (Op0 == nullptr) return nullptr; - if (Real->isBinaryOp()) { - auto *R1 = Real->getOperand(1); - auto *I1 = Imag->getOperand(1); - Op1 = identifyNode(R1, I1); + if (FirstReal->isBinaryOp()) { + OpVals.clear(); + for (auto &V : Vals) { + auto *R1 = cast<Instruction>(V.Real)->getOperand(1); + auto *I1 = cast<Instruction>(V.Imag)->getOperand(1); + OpVals.push_back({R1, I1}); + } + Op1 = identifyNode(OpVals); if (Op1 == nullptr) return nullptr; } - if (isa<FPMathOperator>(Real) && - Real->getFastMathFlags() != Imag->getFastMathFlags()) - return nullptr; - - auto 
Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, - Real, Imag); - Node->Opcode = Real->getOpcode(); - if (isa<FPMathOperator>(Real)) - Node->Flags = Real->getFastMathFlags(); + auto Node = + prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Vals); + Node->Opcode = FirstReal->getOpcode(); + if (isa<FPMathOperator>(FirstReal)) + Node->Flags = FirstReal->getFastMathFlags(); Node->addOperand(Op0); - if (Real->isBinaryOp()) + if (FirstReal->isBinaryOp()) Node->addOperand(Op1); return submitCompositeNode(Node); @@ -909,7 +1000,6 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyDotProduct(Value *V) { - if (!TL->isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation::CDot, V->getType())) { LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving " @@ -1054,65 +1144,77 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - auto It = CachedResult.find({R, I}); +ComplexDeinterleavingGraph::identifyNode(ComplexValues &Vals) { + auto It = CachedResult.find(Vals); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } - if (NodePtr CN = identifyPartialReduction(R, I)) - return CN; - - bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); - if (!IsReduction && R->getType() != I->getType()) - return nullptr; + if (Vals.size() == 1) { + assert(Factor == 2 && "Can only handle interleave factors of 2"); + Value *R = Vals[0].Real; + Value *I = Vals[0].Imag; + if (NodePtr CN = identifyPartialReduction(R, I)) + return CN; + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if (!IsReduction && R->getType() != I->getType()) + return nullptr; + } - if (NodePtr CN = identifySplat(R, I)) + if (NodePtr CN = identifySplat(Vals)) return CN; - auto *Real = dyn_cast<Instruction>(R); - auto *Imag = dyn_cast<Instruction>(I); - if (!Real || !Imag) - return nullptr; + for (auto &V : Vals) { + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (!Real || !Imag) + return nullptr; + } - if (NodePtr CN = identifyDeinterleave(Real, Imag)) + if (NodePtr CN = identifyDeinterleave(Vals)) return CN; - if (NodePtr CN = identifyPHINode(Real, Imag)) - return CN; + if (Vals.size() == 1) { + assert(Factor == 2 && "Can only handle interleave factors of 2"); + auto *Real = dyn_cast<Instruction>(Vals[0].Real); + auto *Imag = dyn_cast<Instruction>(Vals[0].Imag); + if (NodePtr CN = identifyPHINode(Real, Imag)) + return CN; - if (NodePtr CN = identifySelectNode(Real, Imag)) - return CN; + if (NodePtr CN = identifySelectNode(Real, Imag)) + return CN; - auto *VTy = cast<VectorType>(Real->getType()); - auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + auto *VTy = cast<VectorType>(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CMulPartial, NewVTy); - bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CAdd, NewVTy); + bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy); + bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( + 
ComplexDeinterleavingOperation::CAdd, NewVTy); - if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { - if (NodePtr CN = identifyPartialMul(Real, Imag)) - return CN; - } + if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { + if (NodePtr CN = identifyPartialMul(Real, Imag)) + return CN; + } - if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { - if (NodePtr CN = identifyAdd(Real, Imag)) - return CN; - } + if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { + if (NodePtr CN = identifyAdd(Real, Imag)) + return CN; + } - if (HasCMulSupport && HasCAddSupport) { - if (NodePtr CN = identifyReassocNodes(Real, Imag)) - return CN; + if (HasCMulSupport && HasCAddSupport) { + if (NodePtr CN = identifyReassocNodes(Real, Imag)) { + return CN; + } + } } - if (NodePtr CN = identifySymmetricOperation(Real, Imag)) + if (NodePtr CN = identifySymmetricOperation(Vals)) return CN; LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); - CachedResult[{R, I}] = nullptr; + CachedResult[Vals] = nullptr; return nullptr; } @@ -1256,9 +1358,10 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, return nullptr; } assert(FinalNode && "FinalNode can not be nullptr here"); + assert(FinalNode->Vals.size() == 1); // Set the Real and Imag fields of the final node and submit it - FinalNode->Real = Real; - FinalNode->Imag = Imag; + FinalNode->Vals[0].Real = Real; + FinalNode->Vals[0].Imag = Imag; submitCompositeNode(FinalNode); return FinalNode; } @@ -1381,7 +1484,7 @@ ComplexDeinterleavingGraph::identifyMultiplications( auto NodeA = It->second; auto NodeB = PMI.Node; - auto IsMultiplicandReal = PMI.Common == NodeA->Real; + auto IsMultiplicandReal = PMI.Common == NodeA->Vals[0].Real; // The following table illustrates the relationship between multiplications // and rotations. If we consider the multiplication (X + iY) * (U + iV), we // can see: @@ -1423,10 +1526,10 @@ ComplexDeinterleavingGraph::identifyMultiplications( LLVM_DEBUG({ dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n"; - dbgs().indent(4) << "X: " << *NodeA->Real << "\n"; - dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n"; - dbgs().indent(4) << "U: " << *NodeB->Real << "\n"; - dbgs().indent(4) << "V: " << *NodeB->Imag << "\n"; + dbgs().indent(4) << "X: " << *NodeA->Vals[0].Real << "\n"; + dbgs().indent(4) << "Y: " << *NodeA->Vals[0].Imag << "\n"; + dbgs().indent(4) << "U: " << *NodeB->Vals[0].Real << "\n"; + dbgs().indent(4) << "V: " << *NodeB->Vals[0].Imag << "\n"; dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; }); @@ -1595,10 +1698,13 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle); + assert(RootNode->Vals.size() == 1 && + "Cannot handle reductions involving multiple complex values"); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. - auto *R = cast<Instruction>(RootNode->Real); - auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr; + auto *R = cast<Instruction>(RootNode->Vals[0].Real); + auto *I = RootNode->Vals[0].Imag ? 
cast<Instruction>(RootNode->Vals[0].Imag) + : nullptr; Instruction *ReplacementAnchor; if (I) @@ -1631,6 +1737,8 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { bool FoundPotentialReduction = false; + if (Factor != 2) + return false; auto *Br = dyn_cast<BranchInst>(B->getTerminator()); if (!Br || Br->getNumSuccessors() != 2) @@ -1682,6 +1790,8 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { } void ComplexDeinterleavingGraph::identifyReductionNodes() { + assert(Factor == 2 && "Cannot handle multiple complex values"); + SmallVector<bool> Processed(ReductionInfo.size(), false); SmallVector<Instruction *> OperationInstruction; for (auto &P : ReductionInfo) @@ -1771,11 +1881,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { - bool FoundDeinterleaveNode = false; for (NodePtr N : CompositeNodes) { if (!N->areOperandsValid()) return false; + if (N->Operation == ComplexDeinterleavingOperation::Deinterleave) FoundDeinterleaveNode = true; } @@ -1861,17 +1971,33 @@ bool ComplexDeinterleavingGraph::checkNodes() { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) { - if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2) + if (Intrinsic::getInterleaveIntrinsicID(Factor) != + Intrinsic->getIntrinsicID()) return nullptr; - auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0)); - auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1)); - if (!Real || !Imag) - return nullptr; + ComplexValues Vals; + for (unsigned I = 0; I < Factor; I += 2) { + auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(I)); + auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(I + 1)); + if (!Real || !Imag) + return nullptr; + Vals.push_back({Real, Imag}); + } - return identifyNode(Real, Imag); + ComplexDeinterleavingGraph::NodePtr Node1 = identifyNode(Vals); + if (!Node1) + return nullptr; + return Node1; } + // TODO: We could also add support for fixed-width interleave factors of 4 + // and above, but currently for symmetric operations the interleaves and + // deinterleaves are already removed by VectorCombine. If we extend this to + // permit complex multiplications, reductions, etc. then we should also add + // support for fixed-width here. + if (Factor != 2) + return nullptr; + auto *SVI = dyn_cast<ShuffleVectorInst>(RootI); if (!SVI) return nullptr; @@ -1890,22 +2016,52 @@ ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, - Instruction *Imag) { - Instruction *I = nullptr; - Value *FinalValue = nullptr; - if (match(Real, m_ExtractValue<0>(m_Instruction(I))) && - match(Imag, m_ExtractValue<1>(m_Specific(I))) && - match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>( - m_Value(FinalValue)))) { +ComplexDeinterleavingGraph::identifyDeinterleave(ComplexValues &Vals) { + Instruction *II = nullptr; + + // Must be at least one complex value. 
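(To make the multi-value root concrete: identifyRoot() above pairs the interleave intrinsic's operands two at a time, so an llvm.vector.interleave4 root yields two ComplexValue entries. A sketch of that pairing, assuming the ComplexValues typedef from earlier in this file:)

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"

// Operands of an interleaveN root alternate real/imag, e.g.
// {Real0, Imag0, Real1, Imag1} for Factor == 4.
static ComplexValues pairInterleaveOperands(llvm::ArrayRef<llvm::Value *> Ops) {
  ComplexValues Vals;
  for (unsigned Idx = 0; Idx + 1 < Ops.size(); Idx += 2)
    Vals.push_back({Ops[Idx], Ops[Idx + 1]});
  return Vals;
}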
+ auto CheckExtract = [&](Value *V, unsigned ExpectedIdx, + Instruction *ExpectedInsn) -> ExtractValueInst * { + auto *EVI = dyn_cast<ExtractValueInst>(V); + if (!EVI || EVI->getNumIndices() != 1 || + EVI->getIndices()[0] != ExpectedIdx || + !isa<Instruction>(EVI->getAggregateOperand()) || + (ExpectedInsn && ExpectedInsn != EVI->getAggregateOperand())) + return nullptr; + return EVI; + }; + + for (unsigned Idx = 0; Idx < Vals.size(); Idx++) { + ExtractValueInst *RealEVI = CheckExtract(Vals[Idx].Real, Idx * 2, II); + if (RealEVI && Idx == 0) + II = cast<Instruction>(RealEVI->getAggregateOperand()); + if (!RealEVI || !CheckExtract(Vals[Idx].Imag, (Idx * 2) + 1, II)) { + II = nullptr; + break; + } + } + + if (auto *IntrinsicII = dyn_cast_or_null<IntrinsicInst>(II)) { + if (IntrinsicII->getIntrinsicID() != + Intrinsic::getDeinterleaveIntrinsicID(2 * Vals.size())) + return nullptr; + + // The remaining should match too. NodePtr PlaceholderNode = prepareCompositeNode( - llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag); - PlaceholderNode->ReplacementNode = FinalValue; - FinalInstructions.insert(Real); - FinalInstructions.insert(Imag); + llvm::ComplexDeinterleavingOperation::Deinterleave, Vals); + PlaceholderNode->ReplacementNode = II->getOperand(0); + for (auto &V : Vals) { + FinalInstructions.insert(cast<Instruction>(V.Real)); + FinalInstructions.insert(cast<Instruction>(V.Imag)); + } return submitCompositeNode(PlaceholderNode); } + if (Vals.size() != 1) + return nullptr; + + Value *Real = Vals[0].Real; + Value *Imag = Vals[0].Imag; auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real); auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag); if (!RealShuffle || !ImagShuffle) { @@ -1999,7 +2155,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { +ComplexDeinterleavingGraph::identifySplat(ComplexValues &Vals) { auto IsSplat = [](Value *V) -> bool { // Fixed-width vector with constants if (isa<ConstantDataVector>(V)) @@ -2033,24 +2189,39 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { return all_equal(Mask) && Mask[0] == 0; }; - if (!IsSplat(R) || !IsSplat(I)) - return nullptr; - - auto *Real = dyn_cast<Instruction>(R); - auto *Imag = dyn_cast<Instruction>(I); - if ((!Real && Imag) || (Real && !Imag)) - return nullptr; + // The splats must meet the following requirements: + // 1. Must either be all instructions or all values. + // 2. Non-constant splats must live in the same block. 
+ if (auto *FirstValAsInstruction = dyn_cast<Instruction>(Vals[0].Real)) { + BasicBlock *FirstBB = FirstValAsInstruction->getParent(); + for (auto &V : Vals) { + if (!IsSplat(V.Real) || !IsSplat(V.Imag)) + return nullptr; - if (Real && Imag) { - // Non-constant splats should be in the same basic block - if (Real->getParent() != Imag->getParent()) - return nullptr; + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (!Real || !Imag || Real->getParent() != FirstBB || + Imag->getParent() != FirstBB) + return nullptr; + } + } else { + for (auto &V : Vals) { + if (!IsSplat(V.Real) || !IsSplat(V.Imag) || isa<Instruction>(V.Real) || + isa<Instruction>(V.Imag)) + return nullptr; + } + } - FinalInstructions.insert(Real); - FinalInstructions.insert(Imag); + for (auto &V : Vals) { + auto *Real = dyn_cast<Instruction>(V.Real); + auto *Imag = dyn_cast<Instruction>(V.Imag); + if (Real && Imag) { + FinalInstructions.insert(Real); + FinalInstructions.insert(Imag); + } } NodePtr PlaceholderNode = - prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I); + prepareCompositeNode(ComplexDeinterleavingOperation::Splat, Vals); return submitCompositeNode(PlaceholderNode); } @@ -2186,24 +2357,35 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { - auto *R = dyn_cast<Instruction>(Node->Real); - auto *I = dyn_cast<Instruction>(Node->Imag); + SmallVector<Value *> Ops; + for (auto &V : Node->Vals) { + Ops.push_back(V.Real); + Ops.push_back(V.Imag); + } + auto *R = dyn_cast<Instruction>(Node->Vals[0].Real); + auto *I = dyn_cast<Instruction>(Node->Vals[0].Imag); if (R && I) { // Splats that are not constant are interleaved where they are located - Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode(); + Instruction *InsertPoint = R; + for (auto V : Node->Vals) { + if (InsertPoint->comesBefore(cast<Instruction>(V.Real))) + InsertPoint = cast<Instruction>(V.Real); + if (InsertPoint->comesBefore(cast<Instruction>(V.Imag))) + InsertPoint = cast<Instruction>(V.Imag); + } + InsertPoint = InsertPoint->getNextNode(); IRBuilder<> IRB(InsertPoint); - ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag}); + ReplacementNode = IRB.CreateVectorInterleave(Ops); } else { - ReplacementNode = - Builder.CreateVectorInterleave({Node->Real, Node->Imag}); + ReplacementNode = Builder.CreateVectorInterleave(Ops); } break; } case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. 
- auto *OldPHI = cast<PHINode>(Node->Real); - auto *VTy = cast<VectorType>(Node->Real->getType()); + auto *OldPHI = cast<PHINode>(Node->Vals[0].Real); + auto *VTy = cast<VectorType>(Node->Vals[0].Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); OldToNewPHI[OldPHI] = NewPHI; @@ -2219,8 +2401,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, processReductionOperation(ReplacementNode, Node); break; case ComplexDeinterleavingOperation::ReductionSelect: { - auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0); - auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); + auto *MaskReal = cast<Instruction>(Node->Vals[0].Real)->getOperand(0); + auto *MaskImag = cast<Instruction>(Node->Vals[0].Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag}); @@ -2237,7 +2419,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, void ComplexDeinterleavingGraph::processReductionSingle( Value *OperationReplacement, RawNodePtr Node) { - auto *Real = cast<Instruction>(Node->Real); + auto *Real = cast<Instruction>(Node->Vals[0].Real); auto *OldPHI = ReductionInfo[Real].first; auto *NewPHI = OldToNewPHI[OldPHI]; auto *VTy = cast<VectorType>(Real->getType()); @@ -2269,8 +2451,8 @@ void ComplexDeinterleavingGraph::processReductionSingle( void ComplexDeinterleavingGraph::processReductionOperation( Value *OperationReplacement, RawNodePtr Node) { - auto *Real = cast<Instruction>(Node->Real); - auto *Imag = cast<Instruction>(Node->Imag); + auto *Real = cast<Instruction>(Node->Vals[0].Real); + auto *Imag = cast<Instruction>(Node->Vals[0].Imag); auto *OldPHIReal = ReductionInfo[Real].first; auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; @@ -2318,15 +2500,15 @@ void ComplexDeinterleavingGraph::replaceNodes() { if (RootNode->Operation == ComplexDeinterleavingOperation::ReductionOperation) { - auto *RootReal = cast<Instruction>(RootNode->Real); - auto *RootImag = cast<Instruction>(RootNode->Imag); + auto *RootReal = cast<Instruction>(RootNode->Vals[0].Real); + auto *RootImag = cast<Instruction>(RootNode->Vals[0].Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(RootReal); DeadInstrRoots.push_back(RootImag); } else if (RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) { - auto *RootInst = cast<Instruction>(RootNode->Real); + auto *RootInst = cast<Instruction>(RootNode->Vals[0].Real); auto &Info = ReductionInfo[RootInst]; Info.first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(Info.second); diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index d8e3f5f..753c656 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -508,8 +508,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); Value *VScale = Builder.CreateVScale(Int32Ty, "vscale"); - MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size", - /*NUW*/ true, /*NSW*/ false); + MaxEVL = Builder.CreateNUWMul(VScale, FactorConst, 
"scalable_size"); } else { MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false); } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 0f3ec8b..90a18b86 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1099,7 +1099,7 @@ bool CallLowering::checkReturn(CCState &CCInfo, CCAssignFn *Fn) const { for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); - if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo)) + if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], Outs[I].Ty, CCInfo)) return false; } return true; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e84ba91..8163dea 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1821,10 +1821,29 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, return false; } + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // inbounds, reaching the same result in one G_PTR_ADD is also inbounds. + // The nusw constraints are satisfied because imm1+imm2 cannot exceed the + // largest signed integer that fits into the index type, which is the maximum + // size of allocated objects according to the IR Language Reference. + unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = Add2Def->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsInBounds = + PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds; + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsInBounds) { + Flags |= MachineInstr::MIFlag::InBounds; + Flags |= MachineInstr::MIFlag::NoUSWrap; + } + // Pass the combined immediate to the apply function. MatchInfo.Imm = AMNew.BaseOffs; MatchInfo.Base = Base; MatchInfo.Bank = getRegBank(Imm2); + MatchInfo.Flags = Flags; return true; } @@ -1838,6 +1857,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI, Observer.changingInstr(MI); MI.getOperand(1).setReg(MatchInfo.Base); MI.getOperand(2).setReg(NewOffset.getReg(0)); + MI.setFlags(MatchInfo.Flags); Observer.changedInstr(MI); } @@ -4871,14 +4891,34 @@ bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI, if (!C2) return false; + // If both additions are nuw, the reassociated additions are also nuw. + // If the original G_PTR_ADD is additionally nusw, X and C are both not + // negative, so BASE+X is between BASE and BASE+(X+C). The new G_PTR_ADDs are + // therefore also nusw. + // If the original G_PTR_ADD is additionally inbounds (which implies nusw), + // the new G_PTR_ADDs are then also inbounds. 
+ unsigned PtrAddFlags = MI.getFlags(); + unsigned AddFlags = RHS->getFlags(); + bool IsNoUWrap = PtrAddFlags & AddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::NoUSWrap); + bool IsInBounds = IsNoUWrap && (PtrAddFlags & MachineInstr::MIFlag::InBounds); + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsNoUSWrap) + Flags |= MachineInstr::MIFlag::NoUSWrap; + if (IsInBounds) + Flags |= MachineInstr::MIFlag::InBounds; + MatchInfo = [=, &MI](MachineIRBuilder &B) { LLT PtrTy = MRI.getType(MI.getOperand(0).getReg()); auto NewBase = - Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg()); + Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg(), Flags); Observer.changingInstr(MI); MI.getOperand(1).setReg(NewBase.getReg(0)); MI.getOperand(2).setReg(RHS->getOperand(2).getReg()); + MI.setFlags(Flags); Observer.changedInstr(MI); }; return !reassociationCanBreakAddressingModePattern(MI); @@ -4897,6 +4937,25 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI, return false; auto *LHSPtrAdd = cast<GPtrAdd>(LHS); + + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // nuw and inbounds (which implies nusw), the offsets are both non-negative, + // so the new G_PTR_ADDs are also inbounds. + unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsNoUSWrap = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags & + MachineInstr::MIFlag::NoUSWrap); + bool IsInBounds = IsNoUWrap && (PtrAddFlags & LHSPtrAddFlags & + MachineInstr::MIFlag::InBounds); + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsNoUSWrap) + Flags |= MachineInstr::MIFlag::NoUSWrap; + if (IsInBounds) + Flags |= MachineInstr::MIFlag::InBounds; + MatchInfo = [=, &MI](MachineIRBuilder &B) { // When we change LHSPtrAdd's offset register we might cause it to use a reg // before its def. Sink the instruction so the outer PTR_ADD to ensure this @@ -4907,9 +4966,11 @@ bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI, auto NewCst = B.buildConstant(MRI.getType(RHSReg), LHSCstOff->Value); Observer.changingInstr(MI); MI.getOperand(2).setReg(NewCst.getReg(0)); + MI.setFlags(Flags); Observer.changedInstr(MI); Observer.changingInstr(*LHSPtrAdd); LHSPtrAdd->getOperand(2).setReg(RHSReg); + LHSPtrAdd->setFlags(Flags); Observer.changedInstr(*LHSPtrAdd); }; return !reassociationCanBreakAddressingModePattern(MI); @@ -4933,11 +4994,30 @@ bool CombinerHelper::matchReassocFoldConstantsInSubTree( if (!C2) return false; + // Reassociating nuw additions preserves nuw. If both original G_PTR_ADDs are + // inbounds, reaching the same result in one G_PTR_ADD is also inbounds. + // The nusw constraints are satisfied because imm1+imm2 cannot exceed the + // largest signed integer that fits into the index type, which is the maximum + // size of allocated objects according to the IR Language Reference. 
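// Illustrative aside, not part of the patch: matchPtrAddImmedChain earlier and
// matchReassocFoldConstantsInSubTree here derive the same flag set; the helper
// name below is hypothetical and only restates that rule in one place.
static unsigned foldedPtrAddFlags(unsigned OuterFlags, unsigned InnerFlags) {
  unsigned Flags = 0;
  // nuw survives only if both original G_PTR_ADDs were nuw.
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::NoUWrap)
    Flags |= MachineInstr::MIFlag::NoUWrap;
  // inbounds on both yields inbounds, and the imm1+imm2 bound argument above
  // additionally justifies nusw on the folded G_PTR_ADD.
  if (OuterFlags & InnerFlags & MachineInstr::MIFlag::InBounds)
    Flags |= MachineInstr::MIFlag::InBounds | MachineInstr::MIFlag::NoUSWrap;
  return Flags;
}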
+ unsigned PtrAddFlags = MI.getFlags(); + unsigned LHSPtrAddFlags = LHSPtrAdd->getFlags(); + bool IsNoUWrap = PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::NoUWrap; + bool IsInBounds = + PtrAddFlags & LHSPtrAddFlags & MachineInstr::MIFlag::InBounds; + unsigned Flags = 0; + if (IsNoUWrap) + Flags |= MachineInstr::MIFlag::NoUWrap; + if (IsInBounds) { + Flags |= MachineInstr::MIFlag::InBounds; + Flags |= MachineInstr::MIFlag::NoUSWrap; + } + MatchInfo = [=, &MI](MachineIRBuilder &B) { auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2); Observer.changingInstr(MI); MI.getOperand(1).setReg(LHSSrc1); MI.getOperand(2).setReg(NewCst.getReg(0)); + MI.setFlags(Flags); Observer.changedInstr(MI); }; return !reassociationCanBreakAddressingModePattern(MI); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index bbfae57..8424a81 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2209,7 +2209,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END; - const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(0)); if (!AI || !AI->isStaticAlloca()) return true; @@ -2522,6 +2522,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_VECREDUCE_SEQ_FADD : TargetOpcode::G_VECREDUCE_SEQ_FMUL; + if (!MRI->getType(VecSrc).isVector()) + Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_FADD + : TargetOpcode::G_FMUL; MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc}, MachineInstr::copyFlagsFromInstruction(CI)); return true; @@ -2556,6 +2559,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getOrCreateVReg(*ConstantInt::getTrue(CI.getType()))); return true; case Intrinsic::amdgcn_cs_chain: + case Intrinsic::amdgcn_call_whole_wave: return translateCallBase(CI, MIRBuilder); case Intrinsic::fptrunc_round: { uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI); @@ -2786,11 +2790,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (CI.isInlineAsm()) return translateInlineAsm(CI, MIRBuilder); - diagnoseDontCall(CI); - Intrinsic::ID ID = F ? 
F->getIntrinsicID() : Intrinsic::not_intrinsic; - if (!F || ID == Intrinsic::not_intrinsic) - return translateCallBase(CI, MIRBuilder); + if (!F || ID == Intrinsic::not_intrinsic) { + if (translateCallBase(CI, MIRBuilder)) { + diagnoseDontCall(CI); + return true; + } + return false; + } assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic"); @@ -3513,7 +3520,7 @@ void IRTranslator::finishPendingPhis() { Verifier.setCurrentInst(PI); #endif // ifndef NDEBUG - SmallSet<const MachineBasicBlock *, 16> SeenPreds; + SmallPtrSet<const MachineBasicBlock *, 16> SeenPreds; for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { auto IRPred = PI->getIncomingBlock(i); ArrayRef<Register> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i)); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d9d3569..008c188 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5574,12 +5574,19 @@ LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx, unsigned NewElemCount = NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits(); - LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType()); - - // Split the Src and Dst Reg into smaller registers SmallVector<Register> SrcVRegs, BitcastVRegs; - if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy) - return UnableToLegalize; + if (NewElemCount == 1) { + LLT SrcNarrowTy = SrcTy.getElementType(); + + auto Unmerge = MIRBuilder.buildUnmerge(SrcNarrowTy, SrcReg); + getUnmergeResults(SrcVRegs, *Unmerge); + } else { + LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType()); + + // Split the Src and Dst Reg into smaller registers + if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy) + return UnableToLegalize; + } // Build new smaller bitcast instructions // Not supporting Leftover types for now but will have to diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 8955dd0..e41fd81 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1869,8 +1869,10 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_FSHR: case TargetOpcode::G_SMAX: case TargetOpcode::G_SMIN: + case TargetOpcode::G_SCMP: case TargetOpcode::G_UMAX: case TargetOpcode::G_UMIN: + case TargetOpcode::G_UCMP: case TargetOpcode::G_PTRMASK: case TargetOpcode::G_SADDO: case TargetOpcode::G_SSUBO: diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 5e50898..93f6e39 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -268,13 +268,16 @@ static Value *getMaskOperand(IntrinsicInst *II) { } } -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC); - -static Value *getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { +// Return a pair of +// (1) The corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +// (2) Some mask effectively skips a certain field, and this element is a mask +// in which inactive lanes represent fields that are skipped (i.e. "gaps"). 
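// Illustrative sketch, not part of the patch: the call sites below consume the
// (mask, gap mask) pair in essentially this shape. "WideMask", "LeafTy" and the
// enclosing function are placeholders; of the callers, only the shuffle-based
// load path forwards GapMask to the target instead of giving up on gaps.
static bool consumesMaskPair(Value *WideMask, unsigned Factor,
                             VectorType *LeafTy) {
  auto [Mask, GapMask] = getMask(WideMask, Factor, LeafTy);
  if (!Mask)
    return false;                     // no usable per-field mask
  if (GapMask.popcount() != Factor)   // an entire field is disabled: a "gap"
    return true;                      // gap-unaware paths stop here, though the
                                      // IR may already have been changed
  // ... lower the interleaved access using Mask (and GapMask if supported) ...
  return true;
}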
+static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); } @@ -379,22 +382,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; + auto GapMask = APInt::getAllOnes(Factor); if (LI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, VecTy); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " << *Load << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles, - Indices, Factor)) + Indices, Factor, GapMask)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; @@ -536,10 +542,15 @@ bool InterleavedAccessImpl::lowerInterleavedStore( } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - Mask = getMask(getMaskOperand(II), Factor, - ElementCount::getFixed(LaneMaskLen)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, + ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; + // We haven't supported gap mask for stores. Yet it is possible that we + // already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); @@ -556,34 +567,97 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { +// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the +// last field in a factor-of-three interleaved store or deinterleaved load (in +// which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. +// This helper function tries to detect this pattern and return the actual +// factor we're accessing, which is 2 in this example. +static void getGapMask(const Constant &MaskConst, unsigned Factor, + unsigned LeafMaskLen, APInt &GapMask) { + assert(GapMask.getBitWidth() == Factor); + for (unsigned F = 0U; F < Factor; ++F) { + bool AllZero = true; + for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { + Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); + if (!C->isZeroValue()) { + AllZero = false; + break; + } + } + // All mask bits on this field are zero, skipping it. 
+ if (AllZero) + GapMask.clearBit(F); + } +} + +static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + auto GapMask = APInt::getAllOnes(Factor); + if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) { if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); - F && F == Factor && llvm::all_equal(IMI->args())) { - return IMI->getArgOperand(0); + F && F == Factor) { + Value *RefArg = nullptr; + // Check if all the intrinsic arguments are the same, except those that + // are zeros, which we mark as gaps in the gap mask. + for (auto [Idx, Arg] : enumerate(IMI->args())) { + if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) { + GapMask.clearBit(Idx); + continue; + } + + if (!RefArg) + RefArg = Arg; + else if (RefArg != Arg) + return {nullptr, GapMask}; + } + + // In a very rare occasion, all the intrinsic arguments might be zeros, + // in which case we still want to return an all-zeros constant instead of + // nullptr. + return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask}; } } + // Masks that are assembled from bitwise AND. + if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask); + AndOp && AndOp->getOpcode() == Instruction::And) { + auto [MaskLHS, GapMaskLHS] = + getMask(AndOp->getOperand(0), Factor, LeafValueEC); + auto [MaskRHS, GapMaskRHS] = + getMask(AndOp->getOperand(1), Factor, LeafValueEC); + if (!MaskLHS || !MaskRHS) + return {nullptr, GapMask}; + // Using IRBuilder here so that any trivial constants could be folded right + // away. + return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS), + GapMaskLHS & GapMaskRHS}; + } + if (auto *ConstMask = dyn_cast<Constant>(WideMask)) { if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); + return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask}; if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + // First, check if we use a gap mask to skip some of the factors / fields. + getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask); + SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + if (!GapMask[Idx % Factor]) + continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return nullptr; + return {nullptr, GapMask}; LeafMask[Idx / Factor] = C; } - return ConstantVector::get(LeafMask); + return {ConstantVector::get(LeafMask), GapMask}; } } @@ -603,12 +677,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, auto *LeafMaskTy = VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); IRBuilder<> Builder(SVI); - return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), - uint64_t(0)); + return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)), + GapMask}; } } - return nullptr; + return {nullptr, GapMask}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -639,9 +714,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
- Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = + getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; + // We haven't supported gap mask if it's deinterleaving using intrinsics. + // Yet it is possible that we already changed the IR, hence returning true + // here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -680,10 +762,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, - cast<VectorType>(InterleaveValues[0]->getType())); + APInt GapMask(Factor, 0); + std::tie(Mask, GapMask) = + getMask(getMaskOperand(II), Factor, + cast<VectorType>(InterleaveValues[0]->getType())); if (!Mask) return false; + // We haven't supported gap mask if it's interleaving using intrinsics. Yet + // it is possible that we already changed the IR, hence returning true here. + if (GapMask.popcount() != Factor) + return true; LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index 1f23418..c5dfdda 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -213,11 +213,7 @@ void LiveVariables::HandleVirtRegDef(Register Reg, MachineInstr &MI) { } /// FindLastPartialDef - Return the last partial def of the specified register. -/// Also returns the sub-registers that're defined by the instruction. -MachineInstr * -LiveVariables::FindLastPartialDef(Register Reg, - SmallSet<Register, 4> &PartDefRegs) { - Register LastDefReg = 0; +MachineInstr *LiveVariables::FindLastPartialDef(Register Reg) { unsigned LastDefDist = 0; MachineInstr *LastDef = nullptr; for (MCPhysReg SubReg : TRI->subregs(Reg)) { @@ -226,7 +222,6 @@ LiveVariables::FindLastPartialDef(Register Reg, continue; unsigned Dist = DistanceMap[Def]; if (Dist > LastDefDist) { - LastDefReg = SubReg; LastDef = Def; LastDefDist = Dist; } @@ -235,14 +230,6 @@ LiveVariables::FindLastPartialDef(Register Reg, if (!LastDef) return nullptr; - PartDefRegs.insert(LastDefReg); - for (MachineOperand &MO : LastDef->all_defs()) { - if (MO.getReg() == 0) - continue; - Register DefReg = MO.getReg(); - if (TRI->isSubRegister(Reg, DefReg)) - PartDefRegs.insert_range(TRI->subregs_inclusive(DefReg)); - } return LastDef; } @@ -261,27 +248,11 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { // ... // = EAX // All of the sub-registers must have been defined before the use of Reg! - SmallSet<Register, 4> PartDefRegs; - MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs); + MachineInstr *LastPartialDef = FindLastPartialDef(Reg); // If LastPartialDef is NULL, it must be using a livein register. if (LastPartialDef) { - LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[Reg.id()] = LastPartialDef; - SmallSet<MCPhysReg, 8> Processed; - for (MCPhysReg SubReg : TRI->subregs(Reg)) { - if (Processed.count(SubReg)) - continue; - if (PartDefRegs.count(SubReg)) - continue; - // This part of Reg was defined before the last partial def. It's killed - // here. 
- LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg, - false/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[SubReg] = LastPartialDef; - Processed.insert_range(TRI->subregs(SubReg)); - } + LastPartialDef->addOperand( + MachineOperand::CreateReg(Reg, /*IsDef=*/true, /*IsImp=*/true)); } } else if (LastDef && !PhysRegUse[Reg.id()] && !LastDef->findRegisterDefOperand(Reg, /*TRI=*/nullptr)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 3e99e57..bb70e78 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -124,6 +124,11 @@ public: bool initializeFrameInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); + bool initializeSaveRestorePoints( + PerFunctionMIParsingState &PFS, + const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints); + bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -529,7 +534,7 @@ void MIRParserImpl::setupDebugValueTracking( unsigned MaxInstrNum = 0; for (auto &MBB : MF) for (auto &MI : MBB) - MaxInstrNum = std::max((unsigned)MI.peekDebugInstrNum(), MaxInstrNum); + MaxInstrNum = std::max(MI.peekDebugInstrNum(), MaxInstrNum); MF.setDebugInstrNumberingCount(MaxInstrNum); // Load any substitutions. @@ -867,18 +872,14 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.setHasTailCall(YamlMFI.HasTailCall); MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid); MFI.setLocalFrameSize(YamlMFI.LocalFrameSize); - if (!YamlMFI.SavePoint.Value.empty()) { - MachineBasicBlock *MBB = nullptr; - if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint)) - return true; - MFI.setSavePoint(MBB); - } - if (!YamlMFI.RestorePoint.Value.empty()) { - MachineBasicBlock *MBB = nullptr; - if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint)) - return true; - MFI.setRestorePoint(MBB); - } + SmallVector<MachineBasicBlock *, 4> SavePoints; + if (initializeSaveRestorePoints(PFS, YamlMFI.SavePoints, SavePoints)) + return true; + MFI.setSavePoints(SavePoints); + SmallVector<MachineBasicBlock *, 4> RestorePoints; + if (initializeSaveRestorePoints(PFS, YamlMFI.RestorePoints, RestorePoints)) + return true; + MFI.setRestorePoints(RestorePoints); std::vector<CalleeSavedInfo> CSIInfo; // Initialize the fixed frame objects. 
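Save and restore points are now carried as lists end to end (MIR parser and printer, MachineFrameInfo, PrologEpilogInserter). A minimal usage sketch mirroring the setter calls in this patch; SaveMBB and RestoreMBB are placeholder blocks:

SmallVector<MachineBasicBlock *, 4> Saves = {SaveMBB};       // placeholder MBB
SmallVector<MachineBasicBlock *, 4> Restores = {RestoreMBB}; // placeholder MBB
MFI.setSavePoints(Saves);
MFI.setRestorePoints(Restores);
// Consumers that still assume a single point assert there is at most one and
// read front(); clearing (as PEI does once prologue/epilogue are in) passes {}.
MFI.setSavePoints({});
MFI.setRestorePoints({});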
@@ -1093,6 +1094,21 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS, return false; } +// Return true if basic block was incorrectly specified in MIR +bool MIRParserImpl::initializeSaveRestorePoints( + PerFunctionMIParsingState &PFS, + const std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + SmallVectorImpl<MachineBasicBlock *> &SaveRestorePoints) { + MachineBasicBlock *MBB = nullptr; + for (const yaml::SaveRestorePointEntry &Entry : YamlSRPoints) { + if (parseMBBReference(PFS, MBB, Entry.Point.Value)) + return true; + SaveRestorePoints.push_back(MBB); + } + + return false; +} + bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS, const yaml::MachineJumpTable &YamlJTI) { MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index ce1834a..7cc9192 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -150,6 +150,10 @@ static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, const MachineJumpTableInfo &JTI); static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI); +static void +convertSRPoints(ModuleSlotTracker &MST, + std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + ArrayRef<MachineBasicBlock *> SaveRestorePoints); static void convertStackObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST, MFPrintState &State); @@ -355,14 +359,10 @@ static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, YamlMFI.HasTailCall = MFI.hasTailCall(); YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid(); YamlMFI.LocalFrameSize = MFI.getLocalFrameSize(); - if (MFI.getSavePoint()) { - raw_string_ostream StrOS(YamlMFI.SavePoint.Value); - StrOS << printMBBReference(*MFI.getSavePoint()); - } - if (MFI.getRestorePoint()) { - raw_string_ostream StrOS(YamlMFI.RestorePoint.Value); - StrOS << printMBBReference(*MFI.getRestorePoint()); - } + if (!MFI.getSavePoints().empty()) + convertSRPoints(MST, YamlMFI.SavePoints, MFI.getSavePoints()); + if (!MFI.getRestorePoints().empty()) + convertSRPoints(MST, YamlMFI.RestorePoints, MFI.getRestorePoints()); } static void convertEntryValueObjects(yaml::MachineFunction &YMF, @@ -616,6 +616,21 @@ static void convertMCP(yaml::MachineFunction &MF, } } +static void +convertSRPoints(ModuleSlotTracker &MST, + std::vector<yaml::SaveRestorePointEntry> &YamlSRPoints, + ArrayRef<MachineBasicBlock *> SRPoints) { + for (const auto &MBB : SRPoints) { + SmallString<16> Str; + yaml::SaveRestorePointEntry Entry; + raw_svector_ostream StrOS(Str); + StrOS << printMBBReference(*MBB); + Entry.Point = StrOS.str().str(); + Str.clear(); + YamlSRPoints.push_back(Entry); + } +} + static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, const MachineJumpTableInfo &JTI) { YamlJTI.Kind = JTI.getEntryKind(); diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 742de11..e359831 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -490,7 +490,7 @@ private: SmallSetVector<MachineInstr *, 8> MaybeDeadCopies; /// Multimap tracking debug users in current BB - DenseMap<MachineInstr *, SmallSet<MachineInstr *, 2>> CopyDbgUsers; + DenseMap<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> CopyDbgUsers; CopyTracker Tracker; diff --git 
a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 1a20fe5..307f494 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -87,7 +87,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Do this by introducing debug uses of each register definition. If that is // not possible (e.g. we have a phi or a meta instruction), emit a constant. uint64_t NextImm = 0; - SmallSet<DILocalVariable *, 16> VarSet; + SmallPtrSet<DILocalVariable *, 16> VarSet; const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE); for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI(); diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp index e4b9938..a8306b2 100644 --- a/llvm/lib/CodeGen/MachineFrameInfo.cpp +++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -244,6 +244,22 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ } OS << "\n"; } + OS << "save/restore points:\n"; + + if (!SavePoints.empty()) { + OS << "save points:\n"; + + for (auto &item : SavePoints) + OS << printMBBReference(*item) << "\n"; + } else + OS << "save points are empty\n"; + + if (!RestorePoints.empty()) { + OS << "restore points:\n"; + for (auto &item : RestorePoints) + OS << printMBBReference(*item) << "\n"; + } else + OS << "restore points are empty\n"; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index ec40f6a..82ba596 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -154,17 +154,17 @@ void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) { MBB->getParent()->deleteMachineBasicBlock(MBB); } -static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI, - const Function &F) { +static inline Align getFnStackAlignment(const TargetSubtargetInfo &STI, + const Function &F) { if (auto MA = F.getFnStackAlign()) return *MA; - return STI->getFrameLowering()->getStackAlign(); + return STI.getFrameLowering()->getStackAlign(); } MachineFunction::MachineFunction(Function &F, const TargetMachine &Target, const TargetSubtargetInfo &STI, MCContext &Ctx, unsigned FunctionNum) - : F(F), Target(Target), STI(&STI), Ctx(Ctx) { + : F(F), Target(Target), STI(STI), Ctx(Ctx) { FunctionNumber = FunctionNum; init(); } @@ -195,7 +195,7 @@ void MachineFunction::init() { // We can realign the stack if the target supports it and the user hasn't // explicitly asked us not to. - bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() && + bool CanRealignSP = STI.getFrameLowering()->isStackRealignable() && !F.hasFnAttribute("no-realign-stack"); bool ForceRealignSP = F.hasFnAttribute(Attribute::StackAlignment) || F.hasFnAttribute("stackrealign"); @@ -209,11 +209,11 @@ void MachineFunction::init() { FrameInfo->ensureMaxAlignment(*F.getFnStackAlign()); ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); - Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); + Alignment = STI.getTargetLowering()->getMinFunctionAlignment(); if (!F.getAlign() && !F.hasOptSize()) Alignment = std::max(Alignment, - STI->getTargetLowering()->getPrefFunctionAlignment()); + STI.getTargetLowering()->getPrefFunctionAlignment()); // -fsanitize=function and -fsanitize=kcfi instrument indirect function calls // to load a type hash before the function label. 
Ensure functions are aligned diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 4da0184..d9e8484 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -94,6 +94,22 @@ static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, return DebugLoc(); } +/// Check if target reg is contained in given lists, which are: +/// LocalDefsV as given list for virtual regs +/// LocalDefsP as given list for physical regs, in BitVector[RegUnit] form +static bool containsReg(SmallSetVector<Register, 32> LocalDefsV, + const BitVector &LocalDefsP, Register Reg, + const TargetRegisterInfo *TRI) { + if (Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + if (!LocalDefsP[Unit]) + return false; + + return true; + } + return LocalDefsV.contains(Reg); +} + /// finalizeBundle - Finalize a machine instruction bundle which includes /// a sequence of instructions starting from FirstMI to LastMI (exclusive). /// This routine adds a BUNDLE instruction to represent the bundle, it adds @@ -115,6 +131,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, Bundle.prepend(MIB); SmallSetVector<Register, 32> LocalDefs; + BitVector LocalDefsP(TRI->getNumRegUnits()); SmallSet<Register, 8> DeadDefSet; SmallSet<Register, 16> KilledDefSet; SmallSetVector<Register, 8> ExternUses; @@ -130,7 +147,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, if (!Reg) continue; - if (LocalDefs.contains(Reg)) { + if (containsReg(LocalDefs, LocalDefsP, Reg, TRI)) { MO.setIsInternalRead(); if (MO.isKill()) { // Internal def is now killed. @@ -165,8 +182,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } } - if (!MO.isDead() && Reg.isPhysical()) - LocalDefs.insert_range(TRI->subregs(Reg)); + if (!MO.isDead() && Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + LocalDefsP.set(Unit); + } } // Set FrameSetup/FrameDestroy for the bundle. If any of the instructions diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 90005bd..3a9651c 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -3466,9 +3466,9 @@ bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds( } /// Determine transitive dependences of unpipelineable instructions -SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( +SmallPtrSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet<SUnit *, 8> DoNotPipeline; + SmallPtrSet<SUnit *, 8> DoNotPipeline; SmallVector<SUnit *, 8> Worklist; for (auto &SU : SSD->SUnits) @@ -3498,7 +3498,7 @@ SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( // and ensure that they are in stage 0. If unable to do so, return false. 
bool SMSchedule::normalizeNonPipelinedInstructions( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI); + SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI); int NewLastCycle = INT_MIN; for (SUnit &SU : SSD->SUnits) { diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index 975a3fe..1db5301 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -79,7 +79,7 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, FirstSU.ParentClusterIdx = Clusters.size(); SecondSU.ParentClusterIdx = Clusters.size(); - SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}}; + SmallPtrSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}}; Clusters.push_back(Cluster); // TODO - If we want to chain more than two instructions, we need to create diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index a93a89e..34a9d5d 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -72,6 +73,7 @@ class PHIEliminationImpl { LiveIntervals *LIS = nullptr; MachineLoopInfo *MLI = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions /// in predecessor basic blocks. @@ -123,17 +125,22 @@ public: auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>(); auto *MDTWrapper = P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + auto *PDTWrapper = + P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); LV = LVWrapper ? &LVWrapper->getLV() : nullptr; LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr; MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; + PDT = PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; } PHIEliminationImpl(MachineFunction &MF, MachineFunctionAnalysisManager &AM) : LV(AM.getCachedResult<LiveVariablesAnalysis>(MF)), LIS(AM.getCachedResult<LiveIntervalsAnalysis>(MF)), MLI(AM.getCachedResult<MachineLoopAnalysis>(MF)), - MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), MFAM(&AM) {} + MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), + PDT(AM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF)), + MFAM(&AM) {} bool run(MachineFunction &MF); }; @@ -172,6 +179,7 @@ PHIEliminationPass::run(MachineFunction &MF, PA.preserve<LiveVariablesAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<MachineLoopAnalysis>(); return PA; } @@ -197,6 +205,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<MachineLoopInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -204,15 +213,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { bool PHIEliminationImpl::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); - MachineDominatorTree *MDT = nullptr; - if (P) { - auto *MDTWrapper = - P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); - MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - } else { - MDT = MFAM->getCachedResult<MachineDominatorTreeAnalysis>(MF); - } - MachineDomTreeUpdater MDTU(MDT, MachineDomTreeUpdater::UpdateStrategy::Lazy); + MachineDomTreeUpdater MDTU(MDT, PDT, + MachineDomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 8de2c48..96c9cde6 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -145,7 +145,7 @@ static bool lowerObjCCall(Function &F, RTLIB::LibcallImpl NewFn, // FIXME: When RuntimeLibcalls is an analysis, check if the function is really // supported, and go through RTLIB::Libcall. - const char *NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn); + StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn); // If we haven't already looked up this function, check to see if the // program already contains a function with this name. 
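With getLibcallImplName handing back a StringRef, the follow-up lookup can stay on StringRef-based Module APIs. A hedged sketch of that shape only; the rest of lowerObjCCall is outside this hunk, so the exact lookup call and the function type used are assumptions:

Module *M = F.getParent();
StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
// Reuse a declaration if the program already provides one, otherwise insert
// it; reusing F's type here is an illustrative assumption.
FunctionCallee Callee = M->getOrInsertFunction(NewFnName, F.getFunctionType());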
@@ -587,12 +587,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { break; case Intrinsic::exp: case Intrinsic::exp2: + case Intrinsic::log: Changed |= forEachCall(F, [&](CallInst *CI) { Type *Ty = CI->getArgOperand(0)->getType(); if (!isa<ScalableVectorType>(Ty)) return false; const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID()); + assert(Op != ISD::DELETED_NODE && "unsupported intrinsic"); if (!TL->isOperationExpand(Op, EVT::getEVT(Ty))) return false; return lowerUnaryVectorIntrinsicAsLoop(M, CI); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index f66f546..8fc0748 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -351,8 +351,8 @@ bool PEIImpl::run(MachineFunction &MF) { delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); - MFI.setSavePoint(nullptr); - MFI.setRestorePoint(nullptr); + MFI.setSavePoints({}); + MFI.setRestorePoints({}); return true; } @@ -423,16 +423,18 @@ void PEIImpl::calculateCallFrameInfo(MachineFunction &MF) { /// callee-saved registers, and placing prolog and epilog code. void PEIImpl::calculateSaveRestoreBlocks(MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); - // Even when we do not change any CSR, we still want to insert the // prologue and epilogue of the function. // So set the save points for those. // Use the points found by shrink-wrapping, if any. - if (MFI.getSavePoint()) { - SaveBlocks.push_back(MFI.getSavePoint()); - assert(MFI.getRestorePoint() && "Both restore and save must be set"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty()) { + assert(MFI.getSavePoints().size() == 1 && + "Multiple save points are not yet supported!"); + SaveBlocks.push_back(MFI.getSavePoints().front()); + assert(MFI.getRestorePoints().size() == 1 && + "Multiple restore points are not yet supported!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. @@ -558,7 +560,11 @@ static void updateLiveness(MachineFunction &MF) { SmallPtrSet<MachineBasicBlock *, 8> Visited; SmallVector<MachineBasicBlock *, 8> WorkList; MachineBasicBlock *Entry = &MF.front(); - MachineBasicBlock *Save = MFI.getSavePoint(); + + assert(MFI.getSavePoints().size() < 2 && + "Multiple save points not yet supported!"); + MachineBasicBlock *Save = + MFI.getSavePoints().empty() ? nullptr : MFI.getSavePoints().front(); if (!Save) Save = Entry; @@ -569,7 +575,10 @@ static void updateLiveness(MachineFunction &MF) { } Visited.insert(Save); - MachineBasicBlock *Restore = MFI.getRestorePoint(); + assert(MFI.getRestorePoints().size() < 2 && + "Multiple restore points not yet supported!"); + MachineBasicBlock *Restore = + MFI.getRestorePoints().empty() ? nullptr : MFI.getRestorePoints().front(); if (Restore) // By construction Restore cannot be visited, otherwise it // means there exists a path to Restore that does not go @@ -1550,7 +1559,7 @@ void PEIImpl::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, // If this instruction has a FrameIndex operand, we need to // use that target machine register info object to eliminate // it. 
- TRI.eliminateFrameIndex(MI, SPAdj, i); + TRI.eliminateFrameIndex(MI, SPAdj, i, RS); // Reset the iterator if we were at the beginning of the BB. if (AtBeginning) { diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 66a206c..804480c 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -474,6 +474,13 @@ int RegAllocFastImpl::getStackSpaceFor(Register VirtReg) { const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); unsigned Size = TRI->getSpillSize(RC); Align Alignment = TRI->getSpillAlign(RC); + + const MachineFunction &MF = MRI->getMF(); + auto &ST = MF.getSubtarget(); + Align CurrentAlign = ST.getFrameLowering()->getStackAlign(); + if (Alignment > CurrentAlign && !TRI->canRealignStack(MF)) + Alignment = CurrentAlign; + int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment); // Assign the slot. diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index ca51b67..5f37890 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) { - int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc(); if (PDiff > 0) { Delta.CriticalMax = PressureChange(i); Delta.CriticalMax.setUnitInc(PDiff); @@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { - int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc(); if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7341914..8446045 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -401,7 +401,7 @@ namespace { SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); - SDValue foldShiftToAvg(SDNode *N); + SDValue foldShiftToAvg(SDNode *N, const SDLoc &DL); // Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)` SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT); @@ -10983,7 +10983,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue NarrowLoad = reduceLoadWidth(N)) return NarrowLoad; - if (SDValue AVG = foldShiftToAvg(N)) + if (SDValue AVG = foldShiftToAvg(N, DL)) return AVG; return SDValue(); @@ -11256,7 +11256,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI)) return MULH; - if (SDValue AVG = foldShiftToAvg(N)) + if (SDValue AVG = foldShiftToAvg(N, DL)) return AVG; return SDValue(); @@ -11772,51 +11772,36 @@ static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS, } } -SDValue DAGCombiner::foldShiftToAvg(SDNode *N) { +// Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y) +SDValue DAGCombiner::foldShiftToAvg(SDNode *N, const SDLoc &DL) { const unsigned Opcode = N->getOpcode(); - - // Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y) if (Opcode != ISD::SRA && Opcode != ISD::SRL) return SDValue(); - unsigned FloorISD = 0; - auto VT = N->getValueType(0); - bool IsUnsigned = 
false; - - // Decide wether signed or unsigned. - switch (Opcode) { - case ISD::SRA: - if (!hasOperation(ISD::AVGFLOORS, VT)) - return SDValue(); - FloorISD = ISD::AVGFLOORS; - break; - case ISD::SRL: - IsUnsigned = true; - if (!hasOperation(ISD::AVGFLOORU, VT)) - return SDValue(); - FloorISD = ISD::AVGFLOORU; - break; - default: - return SDValue(); - } + EVT VT = N->getValueType(0); + bool IsUnsigned = Opcode == ISD::SRL; // Captured values. SDValue A, B, Add; // Match floor average as it is common to both floor/ceil avgs. - if (!sd_match(N, m_BinOp(Opcode, - m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))), - m_One()))) - return SDValue(); + if (sd_match(N, m_BinOp(Opcode, + m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))), + m_One()))) { + // Decide whether signed or unsigned. + unsigned FloorISD = IsUnsigned ? ISD::AVGFLOORU : ISD::AVGFLOORS; + if (!hasOperation(FloorISD, VT)) + return SDValue(); - // Can't optimize adds that may wrap. - if (IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) - return SDValue(); + // Can't optimize adds that may wrap. + if ((IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) || + (!IsUnsigned && !Add->getFlags().hasNoSignedWrap())) + return SDValue(); - if (!IsUnsigned && !Add->getFlags().hasNoSignedWrap()) - return SDValue(); + return DAG.getNode(FloorISD, DL, N->getValueType(0), {A, B}); + } - return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B}); + return SDValue(); } SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) { @@ -12843,22 +12828,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) { SDLoc DL(HG); EVT MemVT = HG->getMemoryVT(); + EVT DataVT = Index.getValueType(); MachineMemOperand *MMO = HG->getMemOperand(); ISD::MemIndexType IndexType = HG->getIndexType(); if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; - SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, - HG->getScale(), HG->getIntID()}; - if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL)) + if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) || + refineIndexType(Index, IndexType, DataVT, DAG)) { + SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, + HG->getScale(), HG->getIntID()}; return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, MMO, IndexType); + } - EVT DataVT = Index.getValueType(); - if (refineIndexType(Index, IndexType, DataVT, DAG)) - return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, - MMO, IndexType); return SDValue(); } @@ -16343,6 +16327,42 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { DAG, DL); } break; + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: + case ISD::ABDS: + case ISD::ABDU: + // (trunc (avg a, b)) -> (avg (trunc a), (trunc b)) + // (trunc (abdu/abds a, b)) -> (abdu/abds (trunc a), (trunc b)) + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + EVT TruncVT = VT; + unsigned SrcBits = SrcVT.getScalarSizeInBits(); + unsigned TruncBits = TruncVT.getScalarSizeInBits(); + + SDValue A = N0.getOperand(0); + SDValue B = N0.getOperand(1); + bool CanFold = false; + + if (N0.getOpcode() == ISD::AVGFLOORU || N0.getOpcode() == ISD::AVGCEILU || + N0.getOpcode() == ISD::ABDU) { + APInt UpperBits = APInt::getBitsSetFrom(SrcBits, TruncBits); + CanFold = DAG.MaskedValueIsZero(B, UpperBits) && + DAG.MaskedValueIsZero(A, UpperBits); + } else { + unsigned NeededBits = SrcBits - TruncBits; + CanFold = DAG.ComputeNumSignBits(B) > NeededBits && + 
DAG.ComputeNumSignBits(A) > NeededBits; + } + + if (CanFold) { + SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A); + SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B); + return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB); + } + } + break; } return SDValue(); @@ -25987,7 +26007,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { // Combine an extract of an extract into a single extract_subvector. // ext (ext X, C), 0 --> ext X, C if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { - if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), + // The index has to be a multiple of the new result type's known minimum + // vector length. + if (V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 && + TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), V.getConstantOperandVal(1)) && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0), diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index fb9eff9..9467ba1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -729,9 +729,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - ArgListEntry Entry; - Entry.Val = V; - Entry.Ty = V->getType(); + ArgListEntry Entry(V); Entry.setAttributes(CI, ArgI); Args.push_back(Entry); } @@ -978,9 +976,7 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol, assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - ArgListEntry Entry; - Entry.Val = V; - Entry.Ty = V->getType(); + ArgListEntry Entry(V); Entry.setAttributes(CI, ArgI); Args.push_back(Entry); } @@ -1012,17 +1008,16 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { - ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT; - MyFlags.ArgVT = VT; - MyFlags.Used = CLI.IsReturnValueUsed; + ISD::ArgFlagsTy Flags; if (CLI.RetSExt) - MyFlags.Flags.setSExt(); + Flags.setSExt(); if (CLI.RetZExt) - MyFlags.Flags.setZExt(); + Flags.setZExt(); if (CLI.IsInReg) - MyFlags.Flags.setInReg(); - CLI.Ins.push_back(MyFlags); + Flags.setInReg(); + ISD::InputArg Ret(Flags, RegisterVT, VT, CLI.RetTy, CLI.IsReturnValueUsed, + ISD::InputArg::NoArgIndex, 0); + CLI.Ins.push_back(Ret); } } @@ -1117,7 +1112,6 @@ bool FastISel::lowerCall(const CallInst *CI) { Type *RetTy = CI->getType(); ArgListTy Args; - ArgListEntry Entry; Args.reserve(CI->arg_size()); for (auto i = CI->arg_begin(), e = CI->arg_end(); i != e; ++i) { @@ -1127,9 +1121,7 @@ bool FastISel::lowerCall(const CallInst *CI) { if (V->getType()->isEmptyTy()) continue; - Entry.Val = V; - Entry.Ty = V->getType(); - + ArgListEntry Entry(V); // Skip the first return-type Attribute to get to params. 
Entry.setAttributes(CI, i - CI->arg_begin()); Args.push_back(Entry); @@ -1148,9 +1140,12 @@ bool FastISel::lowerCall(const CallInst *CI) { CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI) .setTailCall(IsTailCall); - diagnoseDontCall(*CI); + if (lowerCallTo(CLI)) { + diagnoseDontCall(*CI); + return true; + } - return lowerCallTo(CLI); + return false; } bool FastISel::selectCall(const User *I) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 8c8daef..1a63518 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -81,12 +81,11 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses, /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output. -void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, - Register SrcReg, VRBaseMapType &VRBaseMap) { +void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, + VRBaseMapType &VRBaseMap) { Register VRBase; if (SrcReg.isVirtual()) { // Just use the input register directly! - SDValue Op(Node, ResNo); if (IsClone) VRBaseMap.erase(Op); bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second; @@ -99,17 +98,15 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, // the CopyToReg'd destination register instead of creating a new vreg. bool MatchReg = true; const TargetRegisterClass *UseRC = nullptr; - MVT VT = Node->getSimpleValueType(ResNo); + MVT VT = Op.getSimpleValueType(); // Stick to the preferred register classes for legal types. if (TLI->isTypeLegal(VT)) - UseRC = TLI->getRegClassFor(VT, Node->isDivergent()); + UseRC = TLI->getRegClassFor(VT, Op->isDivergent()); - for (SDNode *User : Node->users()) { + for (SDNode *User : Op->users()) { bool Match = true; - if (User->getOpcode() == ISD::CopyToReg && - User->getOperand(2).getNode() == Node && - User->getOperand(2).getResNo() == ResNo) { + if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2) == Op) { Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg(); if (DestReg.isVirtual()) { VRBase = DestReg; @@ -118,10 +115,8 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, Match = false; } else { for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { - SDValue Op = User->getOperand(i); - if (Op.getNode() != Node || Op.getResNo() != ResNo) + if (User->getOperand(i) != Op) continue; - MVT VT = Node->getSimpleValueType(Op.getResNo()); if (VT == MVT::Other || VT == MVT::Glue) continue; Match = false; @@ -170,11 +165,11 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, } else { // Create the reg, emit the copy. VRBase = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), - VRBase).addReg(SrcReg); + BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::COPY), + VRBase) + .addReg(SrcReg); } - SDValue Op(Node, ResNo); if (IsClone) VRBaseMap.erase(Op); bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second; @@ -1170,7 +1165,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, continue; // This implicitly defined physreg has a use. 
UsedRegs.push_back(Reg); - EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap); + EmitCopyFromReg(SDValue(Node, i), IsClone, Reg, VRBaseMap); } } @@ -1178,7 +1173,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) { for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) { if (F->getOpcode() == ISD::CopyFromReg) { - UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg()); + Register Reg = cast<RegisterSDNode>(F->getOperand(1))->getReg(); + if (Reg.isPhysical()) + UsedRegs.push_back(Reg); continue; } else if (F->getOpcode() == ISD::CopyToReg) { // Skip CopyToReg nodes that are internal to the glue chain. @@ -1281,7 +1278,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, } case ISD::CopyFromReg: { Register SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg(); - EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap); + EmitCopyFromReg(SDValue(Node, 0), IsClone, SrcReg, VRBaseMap); break; } case ISD::EH_LABEL: diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h index 16d754c..b465de8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -48,8 +48,8 @@ private: /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output. - void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, - Register SrcReg, VRBaseMapType &VRBaseMap); + void EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, + VRBaseMapType &VRBaseMap); void CreateVirtualRegisters(SDNode *Node, MachineInstrBuilder &MIB, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ba0ab23..bcfc2c5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2181,12 +2181,10 @@ SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); @@ -2325,11 +2323,9 @@ SDValue SelectionDAGLegalize::ExpandBitCountingLibCall( EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize()); - TargetLowering::ArgListEntry Arg; EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Arg.Node = Op; - Arg.Ty = ArgTy; + TargetLowering::ArgListEntry Arg(Op, ArgTy); Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, /*IsSigned=*/false); Arg.IsZExt = !Arg.IsSExt; @@ -2370,12 +2366,10 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); @@ -2383,8 +2377,8 @@ 
SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, // Also pass the return address of the remainder. SDValue FIPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = FIPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + TargetLowering::ArgListEntry Entry( + FIPtr, PointerType::getUnqual(RetTy->getContext())); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 2cad36e..83bb1df 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -197,7 +197,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(0 + Offset).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -218,7 +218,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) { TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, SDLoc(N), Chain); @@ -558,7 +558,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType(), N->getOperand(2 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, @@ -642,7 +642,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, CallOptions, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) @@ -694,7 +694,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(IsStrict ? 
1 : 0).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -742,7 +742,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) { TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(), N->getOperand(1 + Offset).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, SDLoc(N), Chain); @@ -779,7 +779,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) { // TODO: setTypeListBeforeSoften can't properly express multiple return types, // but we only really need to handle the 0th one for softening anyway. - CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true) + CallOptions.setTypeListBeforeSoften({OpsVT}, VT0) .setOpsTypeOverrides(CallOpsTypeOverrides); auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL, @@ -828,7 +828,7 @@ bool DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults( TargetLowering::MakeLibCallOptions CallOptions; // TODO: setTypeListBeforeSoften can't properly express multiple return types, // but since both returns have the same type it should be okay. - CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true) + CallOptions.setTypeListBeforeSoften({OpsVT}, VT) .setOpsTypeOverrides(CallOpsTypeOverrides); auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, DL, @@ -1100,7 +1100,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { NVT, N->getOperand(IsStrict ? 1 : 0)); TargetLowering::MakeLibCallOptions CallOptions; CallOptions.setIsSigned(Signed); - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), Op, CallOptions, dl, Chain); @@ -1222,7 +1222,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); Op = GetSoftenedFloat(Op); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain); @@ -1298,7 +1298,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { Op = GetSoftenedFloat(Op); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain); @@ -1453,7 +1453,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) { SDValue Chain = IsStrict ? 
N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; EVT OpVT = N->getOperand(0 + Offset).getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N), Chain); @@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break; + case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break; case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break; case ISD::STRICT_FMINNUM: case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; @@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi); } +void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves. + SDLoc dl(N); + GetExpandedFloat(N->getOperand(0), Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -3559,7 +3567,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) { SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); Op = GetSoftenedFloat(Op); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(SVT, RVT, true); + CallOptions.setTypeListBeforeSoften(SVT, RVT); std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain); if (IsStrict) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a5bd97a..90d62e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -5260,20 +5260,18 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, MachinePointerInfo()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &Op : N->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(Op, ArgTy); Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
- Entry.Node = Temp; - Entry.Ty = PointerType::getUnqual(PtrTy->getContext()); + TargetLowering::ArgListEntry Entry( + Temp, PointerType::getUnqual(PtrTy->getContext())); Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 63544e6..33fa301 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -681,6 +681,7 @@ private: SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {}); // clang-format off + void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d2ecc133..2ca9895 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2223,17 +2223,13 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC, SDLoc DL(Node); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.IsSExt = false; - Entry.IsZExt = false; unsigned OpNum = 0; for (auto &VFParam : OptVFInfo->Shape.Parameters) { if (VFParam.ParamKind == VFParamKind::GlobalPredicate) { EVT MaskVT = TLI.getSetCCResultType(DAG.getDataLayout(), *Ctx, VT); - Entry.Node = DAG.getBoolConstant(true, DL, MaskVT, VT); - Entry.Ty = MaskVT.getTypeForEVT(*Ctx); - Args.push_back(Entry); + Args.emplace_back(DAG.getBoolConstant(true, DL, MaskVT, VT), + MaskVT.getTypeForEVT(*Ctx)); continue; } @@ -2241,9 +2237,7 @@ bool VectorLegalizer::tryExpandVecMathCall(SDNode *Node, RTLIB::Libcall LC, if (VFParam.ParamKind != VFParamKind::Vector) return false; - Entry.Node = Node->getOperand(OpNum++); - Entry.Ty = Ty; - Args.push_back(Entry); + Args.emplace_back(Node->getOperand(OpNum++), Ty); } // Emit a call to the vector function. 
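The hunks above (ExpandLibCall, ExpandBitCountingLibCall, ExpandDivRemLibCall, ExpandIntRes_XMULO, tryExpandVecMathCall) all apply the same mechanical change: instead of default-constructing a TargetLowering::ArgListEntry and assigning its Node and Ty fields, call sites now use the (SDValue, Type *) constructor, or Args.emplace_back when no flags need adjusting. A minimal before/after sketch, reusing the names from the ExpandLibCall hunk (Op, ArgTy, TLI, isSigned); illustrative only, not a complete routine:

    // Before: one entry object reused across arguments, fields reassigned each time.
    {
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Op;   // the SDValue being passed
      Entry.Ty = ArgTy;  // its IR type
      Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned);
      Entry.IsZExt = !Entry.IsSExt;
      Args.push_back(Entry);
    }

    // After: construct with (SDValue, Type *) and touch only the optional flags,
    // or emplace directly when the defaults are fine.
    {
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry(Op, ArgTy);
      Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned);
      Entry.IsZExt = !Entry.IsSExt;
      Args.push_back(Entry);

      Args.emplace_back(Op, ArgTy); // no flag overrides needed
    }

Scoping the entry inside the loop body also means flag values can no longer leak from one argument to the next, which was possible with the old shared Entry object.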
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b9e72c9..23102d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1371,7 +1371,7 @@ void SelectionDAG::init(MachineFunction &NewMF, const TargetLibraryInfo *LibraryInfo, UniformityInfo *NewUA, ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin, - FunctionVarLocs const *VarLocs, bool HasDivergency) { + FunctionVarLocs const *VarLocs) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -1384,7 +1384,6 @@ void SelectionDAG::init(MachineFunction &NewMF, BFI = BFIin; MMI = &MMIin; FnVarLocs = VarLocs; - DivergentTarget = HasDivergency; } SelectionDAG::~SelectionDAG() { @@ -2331,8 +2330,7 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) { return SDValue(E, 0); auto *N = newSDNode<RegisterSDNode>(Reg, VTs); - N->SDNodeBits.IsDivergent = - DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA); + N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -2578,18 +2576,12 @@ bool SelectionDAG::expandMultipleResultFPLibCall( } TargetLowering::ArgListTy Args; - auto AddArgListEntry = [&](SDValue Node, Type *Ty) { - TargetLowering::ArgListEntry Entry{}; - Entry.Ty = Ty; - Entry.Node = Node; - Args.push_back(Entry); - }; // Pass the arguments. for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(Ctx); - AddArgListEntry(Op, ArgTy); + Args.emplace_back(Op, ArgTy); } // Pass the output pointers. @@ -2601,7 +2593,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( EVT ResVT = Node->getValueType(ResNo); SDValue ResultPtr = ST ? 
ST->getBasePtr() : CreateStackTemporary(ResVT); ResultPtrs[ResNo] = ResultPtr; - AddArgListEntry(ResultPtr, PointerTy); + Args.emplace_back(ResultPtr, PointerTy); } SDLoc DL(Node); @@ -2610,7 +2602,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( if (VD && VD->isMasked()) { EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT); SDValue Mask = getBoolConstant(true, DL, MaskVT, VT); - AddArgListEntry(Mask, MaskVT.getTypeForEVT(Ctx)); + Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); } Type *RetType = CallRetResNo.has_value() @@ -5462,6 +5454,83 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, } return true; + case ISD::EXTRACT_SUBVECTOR: { + SDValue Src = Op.getOperand(0); + if (Src.getValueType().isScalableVector()) + break; + uint64_t Idx = Op.getConstantOperandVal(1); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx); + return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly, + Depth + 1); + } + + case ISD::INSERT_SUBVECTOR: { + if (Op.getValueType().isScalableVector()) + break; + SDValue Src = Op.getOperand(0); + SDValue Sub = Op.getOperand(1); + uint64_t Idx = Op.getConstantOperandVal(2); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx); + APInt DemandedSrcElts = DemandedElts; + DemandedSrcElts.clearBits(Idx, Idx + NumSubElts); + + if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison( + Sub, DemandedSubElts, PoisonOnly, Depth + 1)) + return false; + if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison( + Src, DemandedSrcElts, PoisonOnly, Depth + 1)) + return false; + return true; + } + + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Src = Op.getOperand(0); + auto *IndexC = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isFixedLengthVector() && IndexC && + IndexC->getAPIntValue().ult(SrcVT.getVectorNumElements())) { + APInt DemandedSrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), + IndexC->getZExtValue()); + return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly, + Depth + 1); + } + break; + } + + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + EVT VT = InVec.getValueType(); + auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); + if (IndexC && VT.isFixedLengthVector() && + IndexC->getAPIntValue().ult(VT.getVectorNumElements())) { + if (DemandedElts[IndexC->getZExtValue()] && + !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1)) + return false; + APInt InVecDemandedElts = DemandedElts; + InVecDemandedElts.clearBit(IndexC->getZExtValue()); + if (!!InVecDemandedElts && + !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts, + PoisonOnly, Depth + 1)) + return false; + return true; + } + break; + } + + case ISD::SCALAR_TO_VECTOR: + // Check upper (known undef) elements. + if (DemandedElts.ugt(1) && !PoisonOnly) + return false; + // Check element zero. 
+ if (DemandedElts[0] && !isGuaranteedNotToBeUndefOrPoison( + Op.getOperand(0), PoisonOnly, Depth + 1)) + return false; + return true; + case ISD::SPLAT_VECTOR: return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly, Depth + 1); @@ -5484,6 +5553,52 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, return true; } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + // Shift amount operand is checked by canCreateUndefOrPoison. So it is + // enough to check operand 0 if Op can't create undef/poison. + return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, + /*ConsiderFlags*/ true, Depth) && + isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts, + PoisonOnly, Depth + 1); + + case ISD::BSWAP: + case ISD::CTPOP: + case ISD::BITREVERSE: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::SSHLSAT: + case ISD::USHLSAT: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: + case ISD::TRUNCATE: + case ISD::VSELECT: { + // If Op can't create undef/poison and none of its operands are undef/poison + // then Op is never undef/poison. A difference from the more common check + // below, outside the switch, is that we handle elementwise operations for + // which the DemandedElts mask is valid for all operands here. + return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, + /*ConsiderFlags*/ true, Depth) && + all_of(Op->ops(), [&](SDValue V) { + return isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, + PoisonOnly, Depth + 1); + }); + } + // TODO: Search for noundef attributes from library functions. // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef. @@ -5549,8 +5664,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::ABDS: case ISD::SMIN: case ISD::SMAX: + case ISD::SCMP: case ISD::UMIN: case ISD::UMAX: + case ISD::UCMP: case ISD::AND: case ISD::XOR: case ISD::ROTL: @@ -5630,7 +5747,11 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FDIV: case ISD::FREM: case ISD::FCOPYSIGN: + case ISD::FMA: + case ISD::FMAD: case ISD::FP_EXTEND: + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: // No poison except from flags (which is handled above) return false; @@ -8896,18 +9017,11 @@ SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, if (!LibCallName) return {}; - // Emit a library call. - auto GetEntry = [](Type *Ty, SDValue &SDV) { - TargetLowering::ArgListEntry E; - E.Ty = Ty; - E.Node = SDV; - return E; - }; - PointerType *PT = PointerType::getUnqual(*getContext()); TargetLowering::ArgListTy Args = { - GetEntry(PT, Mem0), GetEntry(PT, Mem1), - GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)}; + {Mem0, PT}, + {Mem1, PT}, + {Size, getDataLayout().getIntPtrType(*getContext())}}; TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; @@ -8978,13 +9092,10 @@ SDValue SelectionDAG::getMemcpy( // Emit a library call. 
TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Size; Args.push_back(Entry); + Type *PtrTy = PointerType::getUnqual(*getContext()); + Args.emplace_back(Dst, PtrTy); + Args.emplace_back(Src, PtrTy); + Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext())); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; @@ -9022,17 +9133,10 @@ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Type *ArgTy = getDataLayout().getIntPtrType(*getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -9095,13 +9199,10 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Size; Args.push_back(Entry); + Type *PtrTy = PointerType::getUnqual(*getContext()); + Args.emplace_back(Dst, PtrTy); + Args.emplace_back(Src, PtrTy); + Args.emplace_back(Size, getDataLayout().getIntPtrType(*getContext())); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); @@ -9139,17 +9240,10 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); + Args.emplace_back(Dst, IntPtrTy); + Args.emplace_back(Src, IntPtrTy); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -9226,28 +9320,20 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, const char *BzeroName = getTargetLoweringInfo().getLibcallName(RTLIB::BZERO); - // Helper function to create an Entry from Node and Type. - const auto CreateEntry = [](SDValue Node, Type *Ty) { - TargetLowering::ArgListEntry Entry; - Entry.Node = Node; - Entry.Ty = Ty; - return Entry; - }; - bool UseBZero = isNullConstant(Src) && BzeroName; // If zeroing out and bzero is present, use it. 
if (UseBZero) { TargetLowering::ArgListTy Args; - Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx))); - Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + Args.emplace_back(Dst, PointerType::getUnqual(Ctx)); + Args.emplace_back(Size, DL.getIntPtrType(Ctx)); CLI.setLibCallee( TLI->getLibcallCallingConv(RTLIB::BZERO), Type::getVoidTy(Ctx), getExternalSymbol(BzeroName, TLI->getPointerTy(DL)), std::move(Args)); } else { TargetLowering::ArgListTy Args; - Args.push_back(CreateEntry(Dst, PointerType::getUnqual(Ctx))); - Args.push_back(CreateEntry(Src, Src.getValueType().getTypeForEVT(Ctx))); - Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx))); + Args.emplace_back(Dst, PointerType::getUnqual(Ctx)); + Args.emplace_back(Src, Src.getValueType().getTypeForEVT(Ctx)); + Args.emplace_back(Size, DL.getIntPtrType(Ctx)); CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), Dst.getValueType().getTypeForEVT(Ctx), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), @@ -9276,18 +9362,9 @@ SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl, MachinePointerInfo DstPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = getDataLayout().getIntPtrType(*getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - - Entry.Ty = Type::getInt8Ty(*getContext()); - Entry.Node = Value; - Args.push_back(Entry); - - Entry.Ty = SizeTy; - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Dst, getDataLayout().getIntPtrType(*getContext())); + Args.emplace_back(Value, Type::getInt8Ty(*getContext())); + Args.emplace_back(Size, SizeTy); RTLIB::Libcall LibraryCall = RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz); @@ -12264,8 +12341,6 @@ static bool gluePropagatesDivergence(const SDNode *Node) { } bool SelectionDAG::calculateDivergence(SDNode *N) { - if (!DivergentTarget) - return false; if (TLI->isSDNodeAlwaysUniform(N)) { assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) && "Conflicting divergence information!"); @@ -12285,8 +12360,6 @@ bool SelectionDAG::calculateDivergence(SDNode *N) { } void SelectionDAG::updateDivergence(SDNode *N) { - if (!DivergentTarget) - return; SmallVector<SDNode *, 16> Worklist(1, N); do { N = Worklist.pop_back_val(); @@ -13847,20 +13920,16 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) { Ops[I].setInitial(Vals[I]); EVT VT = Ops[I].getValueType(); - // Take care of the Node's operands iff target has divergence // Skip Chain. It does not carry divergence. - if (DivergentTarget && VT != MVT::Other && + if (VT != MVT::Other && (VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) && Ops[I].getNode()->isDivergent()) { - // Node is going to be divergent if at least one of its operand is - // divergent, unless it belongs to the "AlwaysUniform" exemptions. IsDivergent = true; } } Node->NumOperands = Vals.size(); Node->OperandList = Ops; - // Check the divergence of the Node itself. 
- if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) { + if (!TLI->isSDNodeAlwaysUniform(Node)) { IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA); Node->SDNodeBits.IsDivergent = IsDivergent; } @@ -13950,10 +14019,7 @@ SDValue SelectionDAG::makeStateFunctionCall(unsigned LibFunc, SDValue Ptr, const SDLoc &DLoc) { assert(InChain.getValueType() == MVT::Other && "Expected token chain"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Ptr; - Entry.Ty = Ptr.getValueType().getTypeForEVT(*getContext()); - Args.push_back(Entry); + Args.emplace_back(Ptr, Ptr.getValueType().getTypeForEVT(*getContext())); RTLIB::Libcall LC = static_cast<RTLIB::Libcall>(LibFunc); SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC), TLI->getPointerTy(getDataLayout())); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f5f5c14..901f10d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1837,11 +1837,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { getValue(CPA->getDiscriminator())); } - if (isa<ConstantPointerNull>(C)) { - unsigned AS = V->getType()->getPointerAddressSpace(); - return DAG.getConstant(0, getCurSDLoc(), - TLI.getPointerTy(DAG.getDataLayout(), AS)); - } + if (isa<ConstantPointerNull>(C)) + return DAG.getConstant(0, getCurSDLoc(), VT); if (match(C, m_VScale())) return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)); @@ -2211,9 +2208,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, Chains); } else if (I.getNumOperands() != 0) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs); - unsigned NumValues = ValueVTs.size(); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DL, I.getOperand(0)->getType(), Types); + unsigned NumValues = Types.size(); if (NumValues) { SDValue RetOp = getValue(I.getOperand(0)); @@ -2233,7 +2230,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { - EVT VT = ValueVTs[j]; + EVT VT = TLI.getValueType(DL, Types[j]); if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind); @@ -2273,8 +2270,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setNoExt(); for (unsigned i = 0; i < NumParts; ++i) { - Outs.push_back(ISD::OutputArg( - Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, + Parts[i].getValueType().getSimpleVT(), + VT, Types[j], 0, 0)); OutVals.push_back(Parts[i]); } } @@ -2292,6 +2290,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setSwiftError(); Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)), + PointerType::getUnqual(*DAG.getContext()), /*origidx=*/1, /*partOffs=*/0)); // Create SDNode for the swifterror virtual register. 
OutVals.push_back( @@ -3107,9 +3106,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, assert(FnTy->getNumParams() == 1 && "Invalid function signature"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = GuardVal; - Entry.Ty = FnTy->getParamType(0); + TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0)); if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); @@ -3206,9 +3203,7 @@ void SelectionDAGBuilder::visitSPDescriptorFailure( assert(FnTy->getNumParams() == 1 && "Invalid function signature"); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = GuardVal; - Entry.Ty = FnTy->getParamType(0); + TargetLowering::ArgListEntry Entry(GuardVal, FnTy->getParamType(0)); if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); @@ -3578,7 +3573,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. - SmallSet<BasicBlock*, 32> Done; + SmallPtrSet<BasicBlock *, 32> Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); bool Inserted = Done.insert(BB).second; @@ -3977,6 +3972,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } +void SelectionDAGBuilder::visitPtrToAddr(const User &I) { + // FIXME: this is not correct for pointers with addr width != pointer width + visitPtrToInt(I); +} + void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. @@ -4902,9 +4902,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, // extract the splat value and use it as a uniform base. // In all other cases the function returns 'false'. 
static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, - ISD::MemIndexType &IndexType, SDValue &Scale, - SelectionDAGBuilder *SDB, const BasicBlock *CurBB, - uint64_t ElemSize) { + SDValue &Scale, SelectionDAGBuilder *SDB, + const BasicBlock *CurBB, uint64_t ElemSize) { SelectionDAG& DAG = SDB->DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); @@ -4922,7 +4921,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, ElementCount NumElts = cast<VectorType>(Ptr->getType())->getElementCount(); EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts); Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); return true; } @@ -4952,7 +4950,6 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, Base = SDB->getValue(BasePtr); Index = SDB->getValue(IndexVal); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(ScaleVal, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); @@ -4974,9 +4971,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); @@ -4986,8 +4982,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; - Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); + Scale = + DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -4999,7 +4995,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale }; SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl, - Ops, MMO, IndexType, false); + Ops, MMO, ISD::SIGNED_SCALED, false); DAG.setRoot(Scatter); setValue(&I, Scatter); } @@ -5092,9 +5088,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( @@ -5105,8 +5100,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; - Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); + Scale = + DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -5117,8 +5112,9 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; - SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, - Ops, MMO, IndexType, 
ISD::NON_EXTLOAD); + SDValue Gather = + DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, Ops, MMO, + ISD::SIGNED_SCALED, ISD::NON_EXTLOAD); PendingLoads.push_back(Gather.getValue(1)); setValue(&I, Gather); @@ -6431,9 +6427,8 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - ISD::MemIndexType IndexType; SDValue Scale; - bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + bool UniformBase = getUniformBase(Ptr, Base, Index, Scale, this, I.getParent(), VT.getScalarStoreSize()); unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); @@ -6446,7 +6441,6 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } @@ -6462,7 +6456,7 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID}; SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl, - Ops, MMO, IndexType); + Ops, MMO, ISD::SIGNED_SCALED); setValue(&I, Histogram); DAG.setRoot(Histogram); @@ -7514,10 +7508,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } TargetLowering::ArgListTy Args; if (Intrinsic == Intrinsic::ubsantrap) { - Args.push_back(TargetLoweringBase::ArgListEntry()); - Args[0].Val = I.getArgOperand(0); - Args[0].Node = getValue(Args[0].Val); - Args[0].Ty = Args[0].Val->getType(); + Value *Arg = I.getArgOperand(0); + Args.emplace_back(Arg, getValue(Arg)); } TargetLowering::CallLoweringInfo CLI(DAG); @@ -7597,7 +7589,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1)); + const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (!LifetimeObject) return; @@ -7946,9 +7938,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Args.reserve(3); for (unsigned Idx : {2, 3, 1}) { - TargetLowering::ArgListEntry Arg; - Arg.Node = getValue(I.getOperand(Idx)); - Arg.Ty = I.getOperand(Idx)->getType(); + TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)), + I.getOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); Args.push_back(Arg); } @@ -7959,9 +7950,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // Forward the flags and any additional arguments. for (unsigned Idx = 4; Idx < I.arg_size(); ++Idx) { - TargetLowering::ArgListEntry Arg; - Arg.Node = getValue(I.getOperand(Idx)); - Arg.Ty = I.getOperand(Idx)->getType(); + TargetLowering::ArgListEntry Arg(getValue(I.getOperand(Idx)), + I.getOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); Args.push_back(Arg); } @@ -7983,6 +7973,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, HasTailCall = true; return; } + case Intrinsic::amdgcn_call_whole_wave: { + TargetLowering::ArgListTy Args; + + // The first argument is the callee. Skip it when assembling the call args. 
+ for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) { + TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)), + I.getArgOperand(Idx)->getType()); + Arg.setAttributes(&I, Idx); + Args.push_back(Arg); + } + + SDValue ConvControlToken; + if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + auto *Token = Bundle->Inputs[0].get(); + ConvControlToken = getValue(Token); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()) + .setChain(getRoot()) + .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(), + getValue(I.getArgOperand(0)), std::move(Args)) + .setTailCall(false) + .setIsPreallocated( + I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) + .setConvergent(I.isConvergent()) + .setConvergenceControlToken(ConvControlToken); + CLI.CB = &I; + + std::pair<SDValue, SDValue> Result = + lowerInvokable(CLI, /*EHPadBB=*/nullptr); + + if (Result.first.getNode()) + setValue(&I, Result.first); + return; + } case Intrinsic::ptrmask: { SDValue Ptr = getValue(I.getOperand(0)); SDValue Mask = getValue(I.getOperand(1)); @@ -8487,14 +8513,12 @@ void SelectionDAGBuilder::visitVPGather( MachinePointerInfo(AS), MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); SDValue Base, Index, Scale; - ISD::MemIndexType IndexType; - bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent(), - VT.getScalarStoreSize()); + bool UniformBase = + getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); @@ -8506,7 +8530,7 @@ void SelectionDAGBuilder::visitVPGather( LD = DAG.getGatherVP( DAG.getVTList(VT, MVT::Other), VT, DL, {DAG.getRoot(), Base, Index, Scale, OpValues[1], OpValues[2]}, MMO, - IndexType); + ISD::SIGNED_SCALED); PendingLoads.push_back(LD.getValue(1)); setValue(&VPIntrin, LD); } @@ -8550,16 +8574,13 @@ void SelectionDAGBuilder::visitVPScatter( MachinePointerInfo(AS), MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); SDValue Base, Index, Scale; - ISD::MemIndexType IndexType; - bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale, - this, VPIntrin.getParent(), - VT.getScalarStoreSize()); + bool UniformBase = + getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), + VT.getScalarStoreSize()); if (!UniformBase) { Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(PtrOperand); - IndexType = ISD::SIGNED_SCALED; - Scale = - DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); + Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); } EVT IdxVT = Index.getValueType(); EVT EltTy = IdxVT.getVectorElementType(); @@ -8570,7 +8591,7 @@ void SelectionDAGBuilder::visitVPScatter( ST = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, DL, {getMemoryRoot(), OpValues[0], Base, Index, Scale, OpValues[2], OpValues[3]}, - MMO, IndexType); + MMO, ISD::SIGNED_SCALED); DAG.setRoot(ST); setValue(&VPIntrin, ST); } @@ -8912,7 +8933,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, } for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { - TargetLowering::ArgListEntry Entry; const Value 
*V = *I; // Skip empty types @@ -8920,8 +8940,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, continue; SDValue ArgNode = getValue(V); - Entry.Node = ArgNode; Entry.Ty = V->getType(); - + TargetLowering::ArgListEntry Entry(ArgNode, V->getType()); Entry.setAttributes(&CB, I - CB.arg_begin()); // Use swifterror virtual register as input to the call. @@ -8945,11 +8964,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, // If call site has a cfguardtarget operand bundle, create and add an // additional ArgListEntry. if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_cfguardtarget)) { - TargetLowering::ArgListEntry Entry; Value *V = Bundle->Inputs[0]; - SDValue ArgNode = getValue(V); - Entry.Node = ArgNode; - Entry.Ty = V->getType(); + TargetLowering::ArgListEntry Entry(V, getValue(V)); Entry.IsCFGuardTarget = true; Args.push_back(Entry); } @@ -10612,9 +10628,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); - TargetLowering::ArgListEntry Entry; - Entry.Node = getValue(V); - Entry.Ty = V->getType(); + TargetLowering::ArgListEntry Entry(getValue(V), V->getType()); Entry.setAttributes(Call, ArgI); Args.push_back(Entry); } @@ -10974,27 +10988,42 @@ static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair<SDValue, SDValue> TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { + LLVMContext &Context = CLI.RetTy->getContext(); + // Handle the incoming return values from the call. CLI.Ins.clear(); - SmallVector<EVT, 4> RetTys; + SmallVector<Type *, 4> RetOrigTys; SmallVector<TypeSize, 4> Offsets; auto &DL = CLI.DAG.getDataLayout(); - ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets); + ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets); + + SmallVector<EVT, 4> RetVTs; + if (CLI.RetTy != CLI.OrigRetTy) { + assert(RetOrigTys.size() == 1 && + "Only supported for non-aggregate returns"); + RetVTs.push_back(getValueType(DL, CLI.RetTy)); + } else { + for (Type *Ty : RetOrigTys) + RetVTs.push_back(getValueType(DL, Ty)); + } if (CLI.IsPostTypeLegalization) { // If we are lowering a libcall after legalization, split the return type. 
- SmallVector<EVT, 4> OldRetTys; + SmallVector<Type *, 4> OldRetOrigTys; + SmallVector<EVT, 4> OldRetVTs; SmallVector<TypeSize, 4> OldOffsets; - RetTys.swap(OldRetTys); + RetOrigTys.swap(OldRetOrigTys); + RetVTs.swap(OldRetVTs); Offsets.swap(OldOffsets); - for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) { - EVT RetVT = OldRetTys[i]; + for (size_t i = 0, e = OldRetVTs.size(); i != e; ++i) { + EVT RetVT = OldRetVTs[i]; uint64_t Offset = OldOffsets[i]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT); + MVT RegisterVT = getRegisterType(Context, RetVT); + unsigned NumRegs = getNumRegisters(Context, RetVT); unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8; - RetTys.append(NumRegs, RegisterVT); + RetOrigTys.append(NumRegs, OldRetOrigTys[i]); + RetVTs.append(NumRegs, RegisterVT); for (unsigned j = 0; j != NumRegs; ++j) Offsets.push_back(TypeSize::getFixed(Offset + j * RegisterVTByteSZ)); } @@ -11005,7 +11034,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { bool CanLowerReturn = this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), - CLI.IsVarArg, Outs, CLI.RetTy->getContext(), CLI.RetTy); + CLI.IsVarArg, Outs, Context, CLI.RetTy); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -11018,30 +11047,16 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { MachineFunction &MF = CLI.DAG.getMachineFunction(); DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Alignment, false); - Type *StackSlotPtrType = - PointerType::get(CLI.RetTy->getContext(), DL.getAllocaAddrSpace()); + Type *StackSlotPtrType = PointerType::get(Context, DL.getAllocaAddrSpace()); DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); - ArgListEntry Entry; - Entry.Node = DemoteStackSlot; - Entry.Ty = StackSlotPtrType; - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsInReg = false; + ArgListEntry Entry(DemoteStackSlot, StackSlotPtrType); Entry.IsSRet = true; - Entry.IsNest = false; - Entry.IsByVal = false; - Entry.IsByRef = false; - Entry.IsReturned = false; - Entry.IsSwiftSelf = false; - Entry.IsSwiftAsync = false; - Entry.IsSwiftError = false; - Entry.IsCFGuardTarget = false; Entry.Alignment = Alignment; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.NumFixedArgs += 1; CLI.getArgs()[0].IndirectType = CLI.RetTy; - CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); + CLI.RetTy = CLI.OrigRetTy = Type::getVoidTy(Context); // sret demotion isn't compatible with tail-calls, since the sret argument // points into the callers stack frame. 
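Flattened like this, the reworked return-type bookkeeping in TargetLowering::LowerCallTo is hard to follow, so here is a condensed sketch of what the added lines compute, using the same names as the hunk (CLI, DL); the IsPostTypeLegalization splitting shown just above is omitted from the sketch:

    // Leaf IR types (and byte offsets) of the original return type.
    SmallVector<Type *, 4> RetOrigTys;
    SmallVector<TypeSize, 4> Offsets;
    ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets);

    // EVTs actually used for lowering. RetTy may differ from OrigRetTy only for
    // a single value, e.g. a softened float return lowered as an integer.
    SmallVector<EVT, 4> RetVTs;
    if (CLI.RetTy != CLI.OrigRetTy) {
      assert(RetOrigTys.size() == 1 && "Only supported for non-aggregate returns");
      RetVTs.push_back(getValueType(DL, CLI.RetTy));
    } else {
      for (Type *Ty : RetOrigTys)
        RetVTs.push_back(getValueType(DL, Ty));
    }

The original IR types collected here are what later flow into the ISD::InputArg and ISD::OutputArg entries, which in these patches now carry a Type * alongside the register and value EVTs.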
@@ -11049,36 +11064,32 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } else { bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL); - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + for (unsigned I = 0, E = RetVTs.size(); I != E; ++I) { ISD::ArgFlagsTy Flags; if (NeedsRegBlock) { Flags.setInConsecutiveRegs(); - if (I == RetTys.size() - 1) + if (I == RetVTs.size() - 1) Flags.setInConsecutiveRegsLast(); } - EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + EVT VT = RetVTs[I]; + MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumRegs = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); for (unsigned i = 0; i != NumRegs; ++i) { - ISD::InputArg MyFlags; - MyFlags.Flags = Flags; - MyFlags.VT = RegisterVT; - MyFlags.ArgVT = VT; - MyFlags.Used = CLI.IsReturnValueUsed; + ISD::InputArg Ret(Flags, RegisterVT, VT, RetOrigTys[I], + CLI.IsReturnValueUsed, ISD::InputArg::NoArgIndex, 0); if (CLI.RetTy->isPointerTy()) { - MyFlags.Flags.setPointer(); - MyFlags.Flags.setPointerAddrSpace( + Ret.Flags.setPointer(); + Ret.Flags.setPointerAddrSpace( cast<PointerType>(CLI.RetTy)->getAddressSpace()); } if (CLI.RetSExt) - MyFlags.Flags.setSExt(); + Ret.Flags.setSExt(); if (CLI.RetZExt) - MyFlags.Flags.setZExt(); + Ret.Flags.setZExt(); if (CLI.IsInReg) - MyFlags.Flags.setInReg(); - CLI.Ins.push_back(MyFlags); + Ret.Flags.setInReg(); + CLI.Ins.push_back(Ret); } } } @@ -11088,11 +11099,12 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { if (supportSwiftError()) { for (const ArgListEntry &Arg : Args) { if (Arg.IsSwiftError) { - ISD::InputArg MyFlags; - MyFlags.VT = getPointerTy(DL); - MyFlags.ArgVT = EVT(getPointerTy(DL)); - MyFlags.Flags.setSwiftError(); - CLI.Ins.push_back(MyFlags); + ISD::ArgFlagsTy Flags; + Flags.setSwiftError(); + ISD::InputArg Ret(Flags, getPointerTy(DL), EVT(getPointerTy(DL)), + PointerType::getUnqual(Context), + /*Used=*/true, ISD::InputArg::NoArgIndex, 0); + CLI.Ins.push_back(Ret); } } } @@ -11101,18 +11113,24 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { CLI.Outs.clear(); CLI.OutVals.clear(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); + SmallVector<Type *, 4> OrigArgTys; + ComputeValueTypes(DL, Args[i].OrigTy, OrigArgTys); // FIXME: Split arguments if CLI.IsPostTypeLegalization Type *FinalType = Args[i].Ty; if (Args[i].IsByVal) FinalType = Args[i].IndirectType; bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg, DL); - for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + for (unsigned Value = 0, NumValues = OrigArgTys.size(); Value != NumValues; ++Value) { - EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); + Type *OrigArgTy = OrigArgTys[Value]; + Type *ArgTy = OrigArgTy; + if (Args[i].Ty != Args[i].OrigTy) { + assert(Value == 0 && "Only supported for non-aggregate arguments"); + ArgTy = Args[i].Ty; + } + + EVT VT = getValueType(DL, ArgTy); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; @@ -11125,10 +11143,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) 
const { if (i >= CLI.NumFixedArgs) Flags.setVarArg(); - if (Args[i].Ty->isPointerTy()) { + if (ArgTy->isPointerTy()) { Flags.setPointer(); - Flags.setPointerAddrSpace( - cast<PointerType>(Args[i].Ty)->getAddressSpace()); + Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace()); } if (Args[i].IsZExt) Flags.setZExt(); @@ -11202,10 +11219,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { if (NeedsRegBlock) Flags.setInConsecutiveRegs(); - MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + MVT PartVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumParts = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; @@ -11222,7 +11238,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { (CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() && CLI.RetTy->getPointerAddressSpace() == Args[i].Ty->getPointerAddressSpace())) && - RetTys.size() == NumValues && "unexpected use of 'returned'"); + RetVTs.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that // either the register MVT and the actual EVT are the same size or that // the return value and argument are extended in the same way; in these @@ -11247,7 +11263,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // For scalable vectors the scalable part is currently handled // by individual targets, so we just use the known minimum size here. ISD::OutputArg MyFlags( - Flags, Parts[j].getValueType().getSimpleVT(), VT, i, + Flags, Parts[j].getValueType().getSimpleVT(), VT, OrigArgTy, i, j * Parts[j].getValueType().getStoreSize().getKnownMinValue()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); @@ -11303,7 +11319,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // hidden sret parameter. 
MVT PtrVT = getPointerTy(DL, DL.getAllocaAddrSpace()); - unsigned NumValues = RetTys.size(); + unsigned NumValues = RetVTs.size(); ReturnValues.resize(NumValues); SmallVector<SDValue, 4> Chains(NumValues); @@ -11316,7 +11332,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, PtrVT), CLI.DL, SDNodeFlags::NoUnsignedWrap); SDValue L = CLI.DAG.getLoad( - RetTys[i], CLI.DL, CLI.Chain, Add, + RetVTs[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), DemoteStackIdx, Offsets[i]), HiddenSRetAlign); @@ -11334,11 +11350,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (CLI.RetZExt) AssertOp = ISD::AssertZext; unsigned CurReg = 0; - for (EVT VT : RetTys) { - MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); - unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), - CLI.CallConv, VT); + for (EVT VT : RetVTs) { + MVT RegisterVT = getRegisterTypeForCallingConv(Context, CLI.CallConv, VT); + unsigned NumRegs = + getNumRegistersForCallingConv(Context, CLI.CallConv, VT); ReturnValues.push_back(getCopyFromParts( CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, @@ -11354,7 +11369,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, - CLI.DAG.getVTList(RetTys), ReturnValues); + CLI.DAG.getVTList(RetVTs), ReturnValues); return std::make_pair(Res, CLI.Chain); } @@ -11625,7 +11640,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ISD::ArgFlagsTy Flags; Flags.setSRet(); MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVT); - ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, true, + ISD::InputArg RetArg(Flags, RegisterVT, ValueVT, F.getReturnType(), true, ISD::InputArg::NoArgIndex, 0); Ins.push_back(RetArg); } @@ -11640,8 +11655,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set up the incoming argument description vector. for (const Argument &Arg : F.args()) { unsigned ArgNo = Arg.getArgNo(); - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DAG.getDataLayout(), Arg.getType(), Types); bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; Type *FinalType = Arg.getType(); @@ -11649,17 +11664,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { FinalType = Arg.getParamByValType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg(), DL); - for (unsigned Value = 0, NumValues = ValueVTs.size(); - Value != NumValues; ++Value) { - EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + for (unsigned Value = 0, NumValues = Types.size(); Value != NumValues; + ++Value) { + Type *ArgTy = Types[Value]; + EVT VT = TLI->getValueType(DL, ArgTy); ISD::ArgFlagsTy Flags; - - if (Arg.getType()->isPointerTy()) { + if (ArgTy->isPointerTy()) { Flags.setPointer(); - Flags.setPointerAddrSpace( - cast<PointerType>(Arg.getType())->getAddressSpace()); + Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace()); } if (Arg.hasAttribute(Attribute::ZExt)) Flags.setZExt(); @@ -11763,7 +11776,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // are responsible for handling scalable vector arguments and // return values. 
ISD::InputArg MyFlags( - Flags, RegisterVT, VT, isArgValueUsed, ArgNo, + Flags, RegisterVT, VT, ArgTy, isArgValueUsed, ArgNo, PartBase + i * RegisterVT.getStoreSize().getKnownMinValue()); if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); @@ -12737,17 +12750,22 @@ static Register FollowCopyChain(MachineRegisterInfo &MRI, Register Reg) { assert(MI->getOpcode() == TargetOpcode::COPY && "start of copy chain MUST be COPY"); Reg = MI->getOperand(1).getReg(); + + // If the copied register in the first copy must be virtual. + assert(Reg.isVirtual() && "expected COPY of virtual register"); MI = MRI.def_begin(Reg)->getParent(); + // There may be an optional second copy. if (MI->getOpcode() == TargetOpcode::COPY) { assert(Reg.isVirtual() && "expected COPY of virtual register"); Reg = MI->getOperand(1).getReg(); assert(Reg.isPhysical() && "expected COPY of physical register"); - MI = MRI.def_begin(Reg)->getParent(); + } else { + // The start of the chain must be an INLINEASM_BR. + assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR && + "end of copy chain MUST be INLINEASM_BR"); } - // The start of the chain must be an INLINEASM_BR. - assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR && - "end of copy chain MUST be INLINEASM_BR"); + return Reg; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c251755..e0835e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -574,6 +574,7 @@ private: void visitFPToSI(const User &I); void visitUIToFP(const User &I); void visitSIToFP(const User &I); + void visitPtrToAddr(const User &I); void visitPtrToInt(const User &I); void visitIntToPtr(const User &I); void visitBitCast(const User &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 26071ed..ece50ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -480,10 +480,7 @@ void SelectionDAGISel::initializeAnalysisResults( MachineModuleInfo &MMI = MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI(); - TTI = &FAM.getResult<TargetIRAnalysis>(Fn); - - CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. // This is based on the possibly changed OptLevel (after optnone is taken @@ -501,6 +498,10 @@ void SelectionDAGISel::initializeAnalysisResults( BatchAA = std::nullopt; SP = &FAM.getResult<SSPLayoutAnalysis>(Fn); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &FAM.getResult<TargetIRAnalysis>(Fn); +#endif } void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { @@ -536,10 +537,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { MachineModuleInfo &MMI = MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); - - CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. 
// This is based on the possibly changed OptLevel (after optnone is taken @@ -558,6 +556,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { BatchAA = std::nullopt; SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo(); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); +#endif } bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 80aeefe..46a5e44 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -1258,7 +1258,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { if (Record.type == RecordType::Spill) { unsigned Index = Record.payload.FI; - SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy()); + SDValue SpillSlot = DAG.getFrameIndex(Index, getFrameIndexTy()); // All the reloads are independent and are reading memory only modified by // statepoints (i.e. no other aliasing stores); informing SelectionDAG of diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e235d14..402a012 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -162,14 +162,17 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, TargetLowering::ArgListTy Args; Args.reserve(Ops.size()); - TargetLowering::ArgListEntry Entry; ArrayRef<Type *> OpsTypeOverrides = CallOptions.OpsTypeOverrides; for (unsigned i = 0; i < Ops.size(); ++i) { SDValue NewOp = Ops[i]; - Entry.Node = NewOp; - Entry.Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i] + Type *Ty = i < OpsTypeOverrides.size() && OpsTypeOverrides[i] ? 
OpsTypeOverrides[i] - : Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + : NewOp.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry(NewOp, Ty); + if (CallOptions.IsSoften) + Entry.OrigTy = + CallOptions.OpsVTBeforeSoften[i].getTypeForEVT(*DAG.getContext()); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, CallOptions.IsSigned); Entry.IsZExt = !Entry.IsSExt; @@ -189,18 +192,21 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + Type *OrigRetTy = RetTy; TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetTy, CallOptions.IsSigned); bool zeroExtend = !signExtend; - if (CallOptions.IsSoften && - !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) { - signExtend = zeroExtend = false; + if (CallOptions.IsSoften) { + OrigRetTy = CallOptions.RetVTBeforeSoften.getTypeForEVT(*DAG.getContext()); + if (!shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) + signExtend = zeroExtend = false; } CLI.setDebugLoc(dl) .setChain(InChain) - .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setLibCallee(getLibcallCallingConv(LC), RetTy, OrigRetTy, Callee, + std::move(Args)) .setNoReturn(CallOptions.DoesNotReturn) .setDiscardResult(!CallOptions.IsReturnValueUsed) .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization) @@ -420,7 +426,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, TargetLowering::MakeLibCallOptions CallOptions; EVT OpsVT[2] = { OldLHS.getValueType(), OldRHS.getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpsVT, RetVT); auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain); NewLHS = Call.first; NewRHS = DAG.getConstant(0, dl, RetVT); @@ -5125,10 +5131,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, !ISD::isUnsignedIntSetCC(Cond))) && isTypeDesirableForOp(ISD::SETCC, N0.getOperand(0).getValueType())) { EVT NewVT = N0.getOperand(0).getValueType(); - SDValue NewConst = DAG.getConstant(ISD::isSignedIntSetCC(Cond) - ? C1.sext(NewVT.getSizeInBits()) - : C1.zext(NewVT.getSizeInBits()), - dl, NewVT); + SDValue NewConst = DAG.getConstant( + (N0->getFlags().hasNoSignedWrap() && !ISD::isUnsignedIntSetCC(Cond)) + ? 
C1.sext(NewVT.getSizeInBits()) + : C1.zext(NewVT.getSizeInBits()), + dl, NewVT); return DAG.getSetCC(dl, VT, N0.getOperand(0), NewConst, Cond); } @@ -10712,7 +10719,6 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SDLoc dl(GA); ArgListTy Args; - ArgListEntry Entry; const GlobalValue *GV = cast<GlobalValue>(GA->getGlobal()->stripPointerCastsAndAliases()); SmallString<32> NameString("__emutls_v."); @@ -10721,9 +10727,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, const GlobalVariable *EmuTlsVar = GV->getParent()->getNamedGlobal(EmuTlsVarName); assert(EmuTlsVar && "Cannot find EmuTlsVar "); - Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT); - Entry.Ty = VoidPtrType; - Args.push_back(Entry); + Args.emplace_back(DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT), VoidPtrType); SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT); diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 41e956c..938f2d7 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -967,8 +967,14 @@ bool ShrinkWrapImpl::run(MachineFunction &MF) { << "\nRestore: " << printMBBReference(*Restore) << '\n'); MachineFrameInfo &MFI = MF.getFrameInfo(); - MFI.setSavePoint(Save); - MFI.setRestorePoint(Restore); + SmallVector<MachineBasicBlock *, 4> SavePoints; + SmallVector<MachineBasicBlock *, 4> RestorePoints; + if (Save) { + SavePoints.push_back(Save); + RestorePoints.push_back(Restore); + } + MFI.setSavePoints(SavePoints); + MFI.setRestorePoints(RestorePoints); ++NumCandidates; return Changed; } diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp index decffdc..ff4b568 100644 --- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -179,7 +179,7 @@ void SwiftErrorValueTracking::propagateVRegs() { // Check whether we have a single vreg def from all predecessors. // Otherwise we need a phi. 
SmallVector<std::pair<MachineBasicBlock *, Register>, 4> VRegs; - SmallSet<const MachineBasicBlock *, 8> Visited; + SmallPtrSet<const MachineBasicBlock *, 8> Visited; for (auto *Pred : MBB->predecessors()) { if (!Visited.insert(Pred).second) continue; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index bf4c9f9..350948a 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1738,13 +1738,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, DL, ReturnType, ValueVTs); - unsigned NumValues = ValueVTs.size(); + SmallVector<Type *, 4> Types; + ComputeValueTypes(DL, ReturnType, Types); + unsigned NumValues = Types.size(); if (NumValues == 0) return; - for (unsigned j = 0, f = NumValues; j != f; ++j) { - EVT VT = ValueVTs[j]; + for (Type *Ty : Types) { + EVT VT = TLI.getValueType(DL, Ty); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (attr.hasRetAttr(Attribute::SExt)) @@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, Ty, 0, 0)); } } @@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case SIToFP: return ISD::SINT_TO_FP; case FPTrunc: return ISD::FP_ROUND; case FPExt: return ISD::FP_EXTEND; + case PtrToAddr: return ISD::BITCAST; case PtrToInt: return ISD::BITCAST; case IntToPtr: return ISD::BITCAST; case BitCast: return ISD::BITCAST; @@ -1923,6 +1924,8 @@ int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const { return ISD::FEXP; case Intrinsic::exp2: return ISD::FEXP2; + case Intrinsic::log: + return ISD::FLOG; default: return ISD::DELETED_NODE; } diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 99ba893..972bd8f 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -99,7 +99,7 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { // Set preferred alignment if we are still able to realign the stack auto &ST = MF->getSubtarget(); Align CurrentAlign = ST.getFrameLowering()->getStackAlign(); - if (Alignment > CurrentAlign && !ST.getRegisterInfo()->canRealignStack(*MF)) { + if (Alignment > CurrentAlign && !TRI->canRealignStack(*MF)) { Alignment = CurrentAlign; } int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp index 80b4185..0df9137 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp @@ -275,7 +275,8 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) { } Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, - StringRef TheFeatures) { + StringRef TheFeatures, + StringRef TheCPU) { std::string TargetLookupError; const Target *TheTarget = TargetRegistry::lookupTarget(TheTriple, TargetLookupError); @@ -298,9 +299,8 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, MAI.reset(AsmInfo); // Target subtargets. 
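Aside: the SmallSet -> SmallPtrSet swaps recur throughout this patch (CallPrinter, CaptureTracking, and SwiftErrorValueTracking above, plus several files below) and all follow the same pattern. A minimal sketch of that usage, not taken from the patch and with an invented helper name:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
using namespace llvm;

// SmallPtrSet is the dedicated small-set for pointer keys. insert() returns
// {iterator, inserted}; .second is true only on first insertion, so it doubles
// as a "have we visited this block before?" check.
static bool markVisited(SmallPtrSet<const BasicBlock *, 8> &Visited,
                        const BasicBlock *BB) {
  return Visited.insert(BB).second;
}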
- StringRef CPU; MCSubtargetInfo *SubtargetInfo( - TheTarget->createMCSubtargetInfo(TheTriple, CPU, TheFeatures)); + TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures)); if (!SubtargetInfo) return createStringError(errc::invalid_argument, "no subtarget info for target " + TheTriple); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp index e589551..2ff7081 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp @@ -1190,7 +1190,12 @@ Error LVCodeViewReader::loadTargetInfo(const ObjectFile &Obj) { FeaturesValue = SubtargetFeatures(); } FeaturesValue = *Features; - return loadGenericTargetInfo(TT.str(), FeaturesValue.getString()); + + StringRef CPU; + if (auto OptCPU = Obj.tryGetCPUName()) + CPU = *OptCPU; + + return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU); } Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) { @@ -1200,8 +1205,9 @@ Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) { TT.setOS(Triple::Win32); StringRef TheFeature = ""; + StringRef TheCPU = ""; - return loadGenericTargetInfo(TT.str(), TheFeature); + return loadGenericTargetInfo(TT.str(), TheFeature, TheCPU); } std::string LVCodeViewReader::getRegisterName(LVSmall Opcode, diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 696e2bc..62134df 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -956,10 +956,7 @@ LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset, Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) { // Detect the architecture from the object file. We usually don't need OS // info to lookup a target and create register info. 
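Aside: a hedged sketch of the target-lookup pattern the LogicalView readers adopt above; the helper below is my own illustration, assuming ObjectFile::makeTriple() and the optional tryGetCPUName() behave as the hunks use them.

#include "llvm/Object/ObjectFile.h"
#include "llvm/TargetParser/Triple.h"
#include <string>
#include <utility>
using namespace llvm;
using namespace llvm::object;

// Derive the triple and, when the object file records one, the CPU name,
// instead of hand-assembling arch/vendor/OS fields and leaving the CPU blank.
static std::pair<std::string, std::string>
describeTarget(const object::ObjectFile &Obj) {
  Triple TT = Obj.makeTriple();
  StringRef CPU;
  if (auto OptCPU = Obj.tryGetCPUName())
    CPU = *OptCPU;
  return {TT.str(), CPU.str()};
}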
- Triple TT; - TT.setArch(Triple::ArchType(Obj.getArch())); - TT.setVendor(Triple::UnknownVendor); - TT.setOS(Triple::UnknownOS); + Triple TT = Obj.makeTriple(); // Features to be passed to target/subtarget Expected<SubtargetFeatures> Features = Obj.getFeatures(); @@ -969,7 +966,12 @@ Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) { FeaturesValue = SubtargetFeatures(); } FeaturesValue = *Features; - return loadGenericTargetInfo(TT.str(), FeaturesValue.getString()); + + StringRef CPU; + if (auto OptCPU = Obj.tryGetCPUName()) + CPU = *OptCPU; + + return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU); } void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) { diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 1bafed7..ba27aa87 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -64,7 +64,7 @@ public: LLVM_DEBUG({ dbgs() << " Preserving debug section " << Sec.getName() << "\n"; }); - SmallSet<Block *, 8> PreservedBlocks; + SmallPtrSet<Block *, 8> PreservedBlocks; for (auto *Sym : Sec.symbols()) { bool NewPreservedBlock = PreservedBlocks.insert(&Sym->getBlock()).second; diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp index 8e4937d..91a3115 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp @@ -169,7 +169,7 @@ Error EPCIndirectStubsManager::createStubs(const StubInitsMap &StubInits) { std::vector<tpctypes::UInt64Write> PtrUpdates; for (auto &SI : StubInits) PtrUpdates.push_back({(*AvailableStubInfos)[ASIdx++].PointerAddress, - static_cast<uint64_t>(SI.second.first.getValue())}); + SI.second.first.getValue()}); return MemAccess.writeUInt64s(PtrUpdates); } default: diff --git a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp index 19c000e..d460cf6 100644 --- a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp @@ -14,40 +14,39 @@ namespace llvm { namespace orc { -ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, - ThreadSafeContext TSCtx, - GVPredicate ShouldCloneDef, - GVModifier UpdateClonedDefSource) { - assert(TSM && "Can not clone null module"); - - if (!ShouldCloneDef) - ShouldCloneDef = [](const GlobalValue &) { return true; }; - - // First copy the source module into a buffer. 
+static std::pair<std::string, SmallVector<char, 1>> +serializeModule(const Module &M, GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { std::string ModuleName; SmallVector<char, 1> ClonedModuleBuffer; - TSM.withModuleDo([&](Module &M) { - ModuleName = M.getModuleIdentifier(); - std::set<GlobalValue *> ClonedDefsInSrc; - ValueToValueMapTy VMap; - auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) { - if (ShouldCloneDef(*GV)) { - ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV)); - return true; - } - return false; - }); - - if (UpdateClonedDefSource) - for (auto *GV : ClonedDefsInSrc) - UpdateClonedDefSource(*GV); - - BitcodeWriter BCWriter(ClonedModuleBuffer); - BCWriter.writeModule(*Tmp); - BCWriter.writeSymtab(); - BCWriter.writeStrtab(); + + ModuleName = M.getModuleIdentifier(); + std::set<GlobalValue *> ClonedDefsInSrc; + ValueToValueMapTy VMap; + auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) { + if (ShouldCloneDef(*GV)) { + ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV)); + return true; + } + return false; }); + if (UpdateClonedDefSource) + for (auto *GV : ClonedDefsInSrc) + UpdateClonedDefSource(*GV); + + BitcodeWriter BCWriter(ClonedModuleBuffer); + BCWriter.writeModule(*Tmp); + BCWriter.writeSymtab(); + BCWriter.writeStrtab(); + + return {std::move(ModuleName), std::move(ClonedModuleBuffer)}; +} + +ThreadSafeModule +deserializeModule(std::string ModuleName, + const SmallVector<char, 1> &ClonedModuleBuffer, + ThreadSafeContext TSCtx) { MemoryBufferRef ClonedModuleBufferRef( StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()), "cloned module buffer"); @@ -63,6 +62,40 @@ ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, return ThreadSafeModule(std::move(M), std::move(TSCtx)); } +ThreadSafeModule +cloneExternalModuleToContext(const Module &M, ThreadSafeContext TSCtx, + GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { + + if (!ShouldCloneDef) + ShouldCloneDef = [](const GlobalValue &) { return true; }; + + auto [ModuleName, ClonedModuleBuffer] = serializeModule( + M, std::move(ShouldCloneDef), std::move(UpdateClonedDefSource)); + + return deserializeModule(std::move(ModuleName), ClonedModuleBuffer, + std::move(TSCtx)); +} + +ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM, + ThreadSafeContext TSCtx, + GVPredicate ShouldCloneDef, + GVModifier UpdateClonedDefSource) { + assert(TSM && "Can not clone null module"); + + if (!ShouldCloneDef) + ShouldCloneDef = [](const GlobalValue &) { return true; }; + + // First copy the source module into a buffer. 
+ auto [ModuleName, ClonedModuleBuffer] = TSM.withModuleDo([&](Module &M) { + return serializeModule(M, std::move(ShouldCloneDef), + std::move(UpdateClonedDefSource)); + }); + + return deserializeModule(std::move(ModuleName), ClonedModuleBuffer, + std::move(TSCtx)); +} + ThreadSafeModule cloneToNewContext(const ThreadSafeModule &TSM, GVPredicate ShouldCloneDef, GVModifier UpdateClonedDefSource) { diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 08d6c78..d626803 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -654,11 +654,10 @@ bool RuntimeDyldELF::resolveLoongArch64ShortBranch( if (Loc == GlobalSymbolTable.end()) return false; const auto &SymInfo = Loc->second; - Address = - uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( - SymInfo.getOffset())); + Address = Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( + SymInfo.getOffset()); } else { - Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + Address = Sections[Value.SectionID].getLoadAddress(); } uint64_t Offset = RelI->getOffset(); uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp index d581311..4539146 100644 --- a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp @@ -76,7 +76,7 @@ BindingInfo BindingInfoBuilder::calculateBindingInfo( // remove duplicates Binding *NewEnd = llvm::unique(Bindings); if (NewEnd != Bindings.end()) - Bindings.erase(NewEnd); + Bindings.erase(NewEnd, Bindings.end()); BindingInfo Info; diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 574883e..92c62b8 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Frontend/HLSL/HLSLRootSignature.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { @@ -92,10 +93,9 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { - OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), - dxbc::getResourceClasses()); - +static raw_ostream &operator<<(raw_ostream &OS, + const dxil::ResourceClass &Type) { + OS << dxil::getResourceClassName(Type); return OS; } @@ -153,8 +153,7 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) { } raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) { - ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type)); - OS << "Root" << Type << "(" << Descriptor.Reg + OS << "Root" << Descriptor.Type << "(" << Descriptor.Reg << ", space = " << Descriptor.Space << ", visibility = " << Descriptor.Visibility << ", flags = " << Descriptor.Flags << ")"; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 1cda308..dece8f1 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -15,6 +15,7 @@ #include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/DXILABI.h" #include 
"llvm/Support/ScopedPrinter.h" using namespace llvm; @@ -119,9 +120,7 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) { MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), - dxbc::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Descriptor.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); SmallString<7> Name({"Root", ResName}); Metadata *Operands[] = { @@ -161,9 +160,7 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { MDNode *MetadataBuilder::BuildDescriptorTableClause( const DescriptorTableClause &Clause) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), - dxbc::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Clause.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { MDString::get(Ctx, ResName), diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 9d84aa8..72308a3d 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -29,7 +29,7 @@ bool verifyRegisterValue(uint32_t RegisterValue) { // This Range is reserverved, therefore invalid, according to the spec // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal bool verifyRegisterSpace(uint32_t RegisterSpace) { - return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF); + return !(RegisterSpace >= 0xFFFFFFF0); } bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) { diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp index 555e2a6..9e625b8 100644 --- a/llvm/lib/Frontend/OpenMP/OMP.cpp +++ b/llvm/lib/Frontend/OpenMP/OMP.cpp @@ -190,7 +190,7 @@ bool isCombinedConstruct(Directive D) { } ArrayRef<unsigned> getOpenMPVersions() { - static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60}; + static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60, 61}; return Versions; } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ea027e4..e9147a4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -151,6 +151,18 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) { } #endif +/// This is wrapper over IRBuilderBase::restoreIP that also restores the current +/// debug location to the last instruction in the specified basic block if the +/// insert point points to the end of the block. 
+static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, + llvm::IRBuilderBase::InsertPoint IP) { + Builder.restoreIP(IP); + llvm::BasicBlock *BB = Builder.GetInsertBlock(); + llvm::BasicBlock::iterator I = Builder.GetInsertPoint(); + if (!BB->empty() && I == BB->end()) + Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc()); +} + static const omp::GV &getGridValue(const Triple &T, Function *Kernel) { if (T.isAMDGPU()) { StringRef Features = @@ -5918,7 +5930,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd"); } - SmallSet<BasicBlock *, 8> Reachable; + SmallPtrSet<BasicBlock *, 8> Reachable; // Get the basic blocks from the loop in which memref instructions // can be found. @@ -7235,7 +7247,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv); if (!AfterIP) return AfterIP.takeError(); - Builder.restoreIP(*AfterIP); + restoreIPandDebugLoc(Builder, *AfterIP); if (IfCond) return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); @@ -8993,7 +9005,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); Info.RTArgs.SizesArray = Builder.CreateAlloca( SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); } else { auto *SizesArrayInit = ConstantArray::get( ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes); @@ -9012,7 +9024,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( AllocaInst *Buffer = Builder.CreateAlloca( SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); Buffer->setAlignment(OffloadSizeAlign); - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); Builder.CreateMemCpy( Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()), SizesArrayGbl, OffloadSizeAlign, @@ -9022,7 +9034,7 @@ Error OpenMPIRBuilder::emitOffloadingArrays( Info.RTArgs.SizesArray = Buffer; } - Builder.restoreIP(CodeGenIP); + restoreIPandDebugLoc(Builder, CodeGenIP); } // The map types are always constant so we don't need to generate code to diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 7159107..e200f36 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1311,14 +1311,15 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } break; case 'l': - if (Name.starts_with("lifetime.start") || - Name.starts_with("lifetime.end")) { - // Unless remangling is required, do not upgrade the function declaration, - // but do upgrade the calls. - if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F)) - NewFn = *Result; - else - NewFn = F; + if ((Name.starts_with("lifetime.start") || + Name.starts_with("lifetime.end")) && + F->arg_size() == 2) { + Intrinsic::ID IID = Name.starts_with("lifetime.start") + ? 
Intrinsic::lifetime_start + : Intrinsic::lifetime_end; + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + F->getArg(0)->getType()); return true; } break; @@ -5133,21 +5134,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Size = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - if (isa<AllocaInst>(Ptr)) { + if (CI->arg_size() != 2) { DefaultCase(); return; } + Value *Ptr = CI->getArgOperand(1); // Try to strip pointer casts, such that the lifetime works on an alloca. Ptr = Ptr->stripPointerCasts(); if (isa<AllocaInst>(Ptr)) { // Don't use NewFn, as we might have looked through an addrspacecast. if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start) - NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeStart(Ptr); else - NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeEnd(Ptr); break; } @@ -5391,7 +5391,7 @@ void llvm::UpgradeNVVMAnnotations(Module &M) { return; SmallVector<MDNode *, 8> NewNodes; - SmallSet<const MDNode *, 8> SeenNodes; + SmallPtrSet<const MDNode *, 8> SeenNodes; for (MDNode *MD : NamedMD->operands()) { if (!SeenNodes.insert(MD).second) continue; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d4ad21e..6b202ba 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return FoldBitCast(V, DestTy); case Instruction::AddrSpaceCast: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: return nullptr; } diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index e09c139..b454c9a 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::AddrSpaceCast: // Conservatively return getFull set. @@ -871,7 +872,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { return ConstantRange(Lower.sext(DstTySize), Upper.sext(DstTySize)); } -ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { +ConstantRange ConstantRange::truncate(uint32_t DstTySize, + unsigned NoWrapKind) const { assert(getBitWidth() > DstTySize && "Not a value truncation"); if (isEmptySet()) return getEmpty(DstTySize); @@ -885,22 +887,36 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and // then we do the union with [MaxValue, Upper) if (isUpperWrapped()) { - // If Upper is greater than or equal to MaxValue(DstTy), it covers the whole - // truncated range. - if (Upper.getActiveBits() > DstTySize || Upper.countr_one() == DstTySize) + // If Upper is greater than MaxValue(DstTy), it covers the whole truncated + // range. + if (Upper.getActiveBits() > DstTySize) return getFull(DstTySize); - Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize)); - UpperDiv.setAllBits(); - - // Union covers the MaxValue case, so return if the remaining range is just - // MaxValue(DstTy). 
- if (LowerDiv == UpperDiv) - return Union; + // For nuw the two parts are: [0, Upper) \/ [Lower, MaxValue(DstTy)] + if (NoWrapKind & TruncInst::NoUnsignedWrap) { + Union = ConstantRange(APInt::getZero(DstTySize), Upper.trunc(DstTySize)); + UpperDiv = APInt::getOneBitSet(getBitWidth(), DstTySize); + } else { + // If Upper is equal to MaxValue(DstTy), it covers the whole truncated + // range. + if (Upper.countr_one() == DstTySize) + return getFull(DstTySize); + Union = + ConstantRange(APInt::getMaxValue(DstTySize), Upper.trunc(DstTySize)); + UpperDiv.setAllBits(); + // Union covers the MaxValue case, so return if the remaining range is + // just MaxValue(DstTy). + if (LowerDiv == UpperDiv) + return Union; + } } // Chop off the most significant bits that are past the destination bitwidth. if (LowerDiv.getActiveBits() > DstTySize) { + // For trunc nuw if LowerDiv is greater than MaxValue(DstTy), the range is + // outside the whole truncated range. + if (NoWrapKind & TruncInst::NoUnsignedWrap) + return Union; // Mask to just the signficant bits and subtract from LowerDiv/UpperDiv. APInt Adjust = LowerDiv & APInt::getBitsSetFrom(getBitWidth(), DstTySize); LowerDiv -= Adjust; @@ -912,6 +928,10 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { return ConstantRange(LowerDiv.trunc(DstTySize), UpperDiv.trunc(DstTySize)).unionWith(Union); + if (!LowerDiv.isZero() && NoWrapKind & TruncInst::NoUnsignedWrap) + return ConstantRange(LowerDiv.trunc(DstTySize), APInt::getZero(DstTySize)) + .unionWith(Union); + // The truncated value wraps around. Check if we can do better than fullset. if (UpperDivWidth == DstTySize + 1) { // Clear the MSB so that UpperDiv wraps around. diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a3c725b..c7e3113a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty, llvm_unreachable("Invalid cast opcode"); case Instruction::Trunc: return getTrunc(C, Ty, OnlyIfReduced); + case Instruction::PtrToAddr: + return getPtrToAddr(C, Ty, OnlyIfReduced); case Instruction::PtrToInt: return getPtrToInt(C, Ty, OnlyIfReduced); case Instruction::IntToPtr: @@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced); } +Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy, + bool OnlyIfReduced) { + assert(C->getType()->isPtrOrPtrVectorTy() && + "PtrToAddr source must be pointer or pointer vector"); + assert(DstTy->isIntOrIntVectorTy() && + "PtrToAddr destination must be integer or integer vector"); + assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy)); + if (isa<VectorType>(C->getType())) + assert(cast<VectorType>(C->getType())->getElementCount() == + cast<VectorType>(DstTy)->getElementCount() && + "Invalid cast between a different number of vector elements"); + return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced); +} + Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->isPtrOrPtrVectorTy() && @@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) { case 
Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) { case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -3401,6 +3420,7 @@ Instruction *ConstantExpr::getAsInstruction() const { switch (getOpcode()) { case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index f7ef4aa..8b5965b 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2186,6 +2186,11 @@ void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind, unwrap<GlobalObject>(Global)->setMetadata(Kind, unwrap<MDNode>(MD)); } +void LLVMGlobalAddMetadata(LLVMValueRef Global, unsigned Kind, + LLVMMetadataRef MD) { + unwrap<GlobalObject>(Global)->addMetadata(Kind, *unwrap<MDNode>(MD)); +} + void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) { unwrap<GlobalObject>(Global)->eraseMetadata(Kind); } @@ -2194,6 +2199,11 @@ void LLVMGlobalClearMetadata(LLVMValueRef Global) { unwrap<GlobalObject>(Global)->clearMetadata(); } +void LLVMGlobalAddDebugInfo(LLVMValueRef Global, LLVMMetadataRef GVE) { + unwrap<GlobalVariable>(Global)->addDebugInfo( + unwrap<DIGlobalVariableExpression>(GVE)); +} + /*--.. Operations on global variables ......................................--*/ LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index ab8ecee..8e523bc 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1896,29 +1896,8 @@ AssignmentInstRange at::getAssignmentInsts(DIAssignID *ID) { return make_range(MapIt->second.begin(), MapIt->second.end()); } -AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) { - assert(ID && "Expected non-null ID"); - LLVMContext &Ctx = ID->getContext(); - - auto *IDAsValue = MetadataAsValue::getIfExists(Ctx, ID); - - // The ID is only used wrapped in MetadataAsValue(ID), so lets check that - // one of those already exists first. 
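Aside: a small usage sketch for the two C-API entry points added in Core.cpp above (LLVMGlobalAddMetadata and LLVMGlobalAddDebugInfo). The function below is my own example, and the header placement of the declarations is an assumption.

#include "llvm-c/Core.h"
#include "llvm-c/DebugInfo.h"

// Append a metadata attachment instead of replacing existing ones (which is
// what LLVMGlobalSetMetadata does), then register a debug-info expression on
// the variable. Global must wrap a GlobalVariable and GVE must wrap a
// DIGlobalVariableExpression, per the unwrap<> casts in the hunk above.
static void attachGlobalDebugInfo(LLVMValueRef Global, unsigned KindID,
                                  LLVMMetadataRef MD, LLVMMetadataRef GVE) {
  LLVMGlobalAddMetadata(Global, KindID, MD);
  LLVMGlobalAddDebugInfo(Global, GVE);
}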
- if (!IDAsValue) - return make_range(Value::user_iterator(), Value::user_iterator()); - - return make_range(IDAsValue->user_begin(), IDAsValue->user_end()); -} - void at::deleteAssignmentMarkers(const Instruction *Inst) { - auto Range = getAssignmentMarkers(Inst); - SmallVector<DbgVariableRecord *> DVRAssigns = getDVRAssignmentMarkers(Inst); - if (Range.empty() && DVRAssigns.empty()) - return; - SmallVector<DbgAssignIntrinsic *> ToDelete(Range.begin(), Range.end()); - for (auto *DAI : ToDelete) - DAI->eraseFromParent(); - for (auto *DVR : DVRAssigns) + for (auto *DVR : getDVRAssignmentMarkers(Inst)) DVR->eraseFromParent(); } @@ -1936,31 +1915,21 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) { } void at::deleteAll(Function *F) { - SmallVector<DbgAssignIntrinsic *, 12> ToDelete; - SmallVector<DbgVariableRecord *, 12> DPToDelete; for (BasicBlock &BB : *F) { for (Instruction &I : BB) { - for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + for (DbgVariableRecord &DVR : + make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) if (DVR.isDbgAssign()) - DPToDelete.push_back(&DVR); - if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) - ToDelete.push_back(DAI); - else - I.setMetadata(LLVMContext::MD_DIAssignID, nullptr); + DVR.eraseFromParent(); + + I.setMetadata(LLVMContext::MD_DIAssignID, nullptr); } } - for (auto *DAI : ToDelete) - DAI->eraseFromParent(); - for (auto *DVR : DPToDelete) - DVR->eraseFromParent(); } -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -template <typename T> -bool calculateFragmentIntersectImpl( +bool at::calculateFragmentIntersect( const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const T *AssignRecord, + uint64_t SliceSizeInBits, const DbgVariableRecord *AssignRecord, std::optional<DIExpression::FragmentInfo> &Result) { // No overlap if this DbgRecord describes a killed location. if (AssignRecord->isKillAddress()) @@ -1989,26 +1958,6 @@ bool calculateFragmentIntersectImpl( BitExtractOffsetInBits, VarFrag, Result, OffsetFromLocationInBits); } -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -bool at::calculateFragmentIntersect( - const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DbgAssign, - std::optional<DIExpression::FragmentInfo> &Result) { - return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, - SliceSizeInBits, DbgAssign, Result); -} - -/// FIXME: Remove this wrapper function and call -/// DIExpression::calculateFragmentIntersect directly. -bool at::calculateFragmentIntersect( - const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign, - std::optional<DIExpression::FragmentInfo> &Result) { - return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, - SliceSizeInBits, DVRAssign, Result); -} - /// Update inlined instructions' DIAssignID metadata. We need to do this /// otherwise a function inlined more than once into the same function /// will cause DIAssignID to be shared by many instructions. 
@@ -2029,8 +1978,6 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map, } if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID)) I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID)); - else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) - DAI->setAssignId(GetNewID(DAI->getAssignID())); } /// Collect constant properies (base, size, offset) of \p StoreDest. diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index f1d4549..96065ed 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR) DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef<Metadata *> MDs, bool ImplicitCode) - : MDNode(C, DILocationKind, Storage, MDs) -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - , - AtomGroup(AtomGroup), AtomRank(AtomRank) -#endif -{ -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS + : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup), + AtomRank(AtomRank) { assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits"); -#endif if (AtomGroup) C.updateDILocationAtomGroupWaterline(AtomGroup + 1); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 7b799c7..11d33e2 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases, return findBaseObject(CE->getOperand(0), Aliases, Op); } case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::BitCast: case Instruction::GetElementPtr: diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 49c6dc7..614c3a9 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -411,28 +411,16 @@ CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) { return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src); } -CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.start only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.start requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, {Ptr}); } -CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.end only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.end requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, {Ptr}); } CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index b7cd12a..5e87b5f 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -26,9 +26,18 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" using namespace llvm; 
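Aside: the IRBuilder change above removes the explicit size operand from lifetime markers. A minimal usage sketch of the single-argument form (helper name invented for illustration):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Lifetime intrinsics now take only the alloca pointer; the object size is
// implied by the alloca itself rather than passed as an i64.
static void emitScopedLifetime(IRBuilderBase &B, AllocaInst *Slot) {
  B.CreateLifetimeStart(Slot);
  // ... emit the code that uses Slot here ...
  B.CreateLifetimeEnd(Slot);
}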
+// FIXME: Flag used for an ablation performance test, Issue #147390. Placing it +// here because referencing IR should be feasible from anywhere. Will be +// removed after the ablation test. +cl::opt<bool> ProfcheckDisableMetadataFixes( + "profcheck-disable-metadata-fixes", cl::Hidden, cl::init(false), + cl::desc( + "Disable metadata propagation fixes discovered through Issue #147390")); + InsertPosition::InsertPosition(Instruction *InsertBefore) : InsertAt(InsertBefore ? InsertBefore->getIterator() : InstListType::iterator()) {} @@ -543,14 +552,19 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata( CB->removeRetAttrs(UBImplyingAttributes); } -void Instruction::dropUBImplyingAttrsAndMetadata() { +void Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef<unsigned> Keep) { // !annotation metadata does not impact semantics. // !range, !nonnull and !align produce poison, so they are safe to speculate. // !noundef and various AA metadata must be dropped, as it generally produces // immediate undefined behavior. - unsigned KnownIDs[] = {LLVMContext::MD_annotation, LLVMContext::MD_range, - LLVMContext::MD_nonnull, LLVMContext::MD_align}; - dropUBImplyingAttrsAndUnknownMetadata(KnownIDs); + static const unsigned KnownIDs[] = { + LLVMContext::MD_annotation, LLVMContext::MD_range, + LLVMContext::MD_nonnull, LLVMContext::MD_align}; + SmallVector<unsigned> KeepIDs; + KeepIDs.reserve(Keep.size() + std::size(KnownIDs)); + append_range(KeepIDs, KnownIDs); + append_range(KeepIDs, Keep); + dropUBImplyingAttrsAndUnknownMetadata(KeepIDs); } bool Instruction::hasUBImplyingAttrs() const { @@ -817,6 +831,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case UIToFP: return "uitofp"; case SIToFP: return "sitofp"; case IntToPtr: return "inttoptr"; + case PtrToAddr: return "ptrtoaddr"; case PtrToInt: return "ptrtoint"; case BitCast: return "bitcast"; case AddrSpaceCast: return "addrspacecast"; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b896382..a1751c0 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode, return false; case Instruction::BitCast: return true; // BitCast never modifies bits. + case Instruction::PtrToAddr: case Instruction::PtrToInt: return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() == DestTy->getScalarSizeInBits(); @@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair( // same reason. 
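Aside: a hedged sketch of the new Keep parameter on dropUBImplyingAttrsAndMetadata shown above. Preserving !prof here is purely my example of what a caller might keep; the patch itself only adds the parameter.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Drop attributes and metadata that could introduce UB once the instruction is
// speculated, but keep its branch-weight profile data attached.
static void prepareForSpeculation(Instruction &I) {
  static const unsigned KeepIDs[] = {LLVMContext::MD_prof};
  I.dropUBImplyingAttrsAndMetadata(KeepIDs);
}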
const unsigned numCastOps = Instruction::CastOpsEnd - Instruction::CastOpsBegin; + // clang-format off static const uint8_t CastResults[numCastOps][numCastOps] = { - // T F F U S F F P I B A -+ - // R Z S P P I I T P 2 N T S | - // U E E 2 2 2 2 R E I T C C +- secondOp - // N X X U S F F N X N 2 V V | - // C T T I I P P C T T P T T -+ - { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+ - { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt | - { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc | - { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt | - { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt | - { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr | - { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast | - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ + // T F F U S F F P P I B A -+ + // R Z S P P I I T P 2 2 N T S | + // U E E 2 2 2 2 R E I A T C C +- secondOp + // N X X U S F F N X N D 2 V V | + // C T T I I P P C T T R P T T -+ + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+ + { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt | + { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | + { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // clang-format on // TODO: This logic could be encoded into the table above and handled in the // switch below. 
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore); case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore); case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore); + case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore); case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore); case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore); case BitCast: @@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) { case Instruction::FPToSI: return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() && SrcEC == DstEC; + case Instruction::PtrToAddr: case Instruction::PtrToInt: if (SrcEC != DstEC) return false; @@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name, assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); } +PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name, + InsertPosition InsertBefore) + : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr"); +} + IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name, InsertPosition InsertBefore) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { @@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const { return new PtrToIntInst(getOperand(0), getType()); } +PtrToAddrInst *PtrToAddrInst::cloneImpl() const { + return new PtrToAddrInst(getOperand(0), getType()); +} + IntToPtrInst *IntToPtrInst::cloneImpl() const { return new IntToPtrInst(getOperand(0), getType()); } diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index aa2a60e..e03f993 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey { template <> struct MDNodeKeyImpl<DILocation> { Metadata *Scope; Metadata *InlinedAt; -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS uint64_t AtomGroup : 61; uint64_t AtomRank : 3; -#endif unsigned Line; uint16_t Column; bool ImplicitCode; @@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> { MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope, Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup, uint8_t AtomRank) - : Scope(Scope), InlinedAt(InlinedAt), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - AtomGroup(AtomGroup), AtomRank(AtomRank), -#endif - Line(Line), Column(Column), ImplicitCode(ImplicitCode) { - } + : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup), + AtomRank(AtomRank), Line(Line), Column(Column), + ImplicitCode(ImplicitCode) {} MDNodeKeyImpl(const DILocation *L) : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()), -#endif Line(L->getLine()), Column(L->getColumn()), - ImplicitCode(L->isImplicitCode()) { - } + ImplicitCode(L->isImplicitCode()) {} bool isKeyOf(const DILocation *RHS) const { return Line == RHS->getLine() && Column == RHS->getColumn() && Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() && - ImplicitCode == RHS->isImplicitCode() -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - && AtomGroup == RHS->getAtomGroup() && - AtomRank == RHS->getAtomRank(); -#else - ; -#endif + ImplicitCode == RHS->isImplicitCode() && + AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank(); } unsigned getHashValue() const { -#ifdef 
EXPERIMENTAL_KEY_INSTRUCTIONS // Hashing AtomGroup and AtomRank substantially impacts performance whether // Key Instructions is enabled or not. We can't detect whether it's enabled // here cheaply; avoiding hashing zero values is a good approximation. This @@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> { if (AtomGroup || AtomRank) return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode, AtomGroup, (uint8_t)AtomRank); -#endif return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode); } }; diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index b1b5f67..d24263f 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -270,6 +270,18 @@ void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights, I.setMetadata(LLVMContext::MD_prof, BranchWeights); } +SmallVector<uint32_t> downscaleWeights(ArrayRef<uint64_t> Weights, + std::optional<uint64_t> KnownMaxCount) { + uint64_t MaxCount = KnownMaxCount.has_value() ? KnownMaxCount.value() + : *llvm::max_element(Weights); + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector<uint32_t> DownscaledWeights; + for (const auto &ECI : Weights) + DownscaledWeights.push_back(scaleBranchCount(ECI, Scale)); + return DownscaledWeights; +} + void scaleProfData(Instruction &I, uint64_t S, uint64_t T) { assert(T != 0 && "Caller should guarantee"); auto *ProfileData = I.getMetadata(LLVMContext::MD_prof); diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index a8e6c79..3c324f2 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -9,6 +9,8 @@ #include "llvm/IR/RuntimeLibcalls.h" #include "llvm/ADT/StringTable.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/xxhash.h" +#include "llvm/TargetParser/ARMTargetParser.h" #define DEBUG_TYPE "runtime-libcalls-info" @@ -17,51 +19,11 @@ using namespace RTLIB; #define GET_INIT_RUNTIME_LIBCALL_NAMES #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS +#define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" #undef GET_INIT_RUNTIME_LIBCALL_NAMES #undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS - -static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, - FloatABI::ABIType FloatABIType, - EABI EABIVersion) { - static const RTLIB::LibcallImpl AAPCS_Libcalls[] = { - RTLIB::__aeabi_dadd, RTLIB::__aeabi_ddiv, - RTLIB::__aeabi_dmul, RTLIB::__aeabi_dsub, - RTLIB::__aeabi_dcmpeq__oeq, RTLIB::__aeabi_dcmpeq__une, - RTLIB::__aeabi_dcmplt, RTLIB::__aeabi_dcmple, - RTLIB::__aeabi_dcmpge, RTLIB::__aeabi_dcmpgt, - RTLIB::__aeabi_dcmpun, RTLIB::__aeabi_fadd, - RTLIB::__aeabi_fdiv, RTLIB::__aeabi_fmul, - RTLIB::__aeabi_fsub, RTLIB::__aeabi_fcmpeq__oeq, - RTLIB::__aeabi_fcmpeq__une, RTLIB::__aeabi_fcmplt, - RTLIB::__aeabi_fcmple, RTLIB::__aeabi_fcmpge, - RTLIB::__aeabi_fcmpgt, RTLIB::__aeabi_fcmpun, - RTLIB::__aeabi_d2iz, RTLIB::__aeabi_d2uiz, - RTLIB::__aeabi_d2lz, RTLIB::__aeabi_d2ulz, - RTLIB::__aeabi_f2iz, RTLIB::__aeabi_f2uiz, - RTLIB::__aeabi_f2lz, RTLIB::__aeabi_f2ulz, - RTLIB::__aeabi_d2f, RTLIB::__aeabi_d2h, - RTLIB::__aeabi_f2d, RTLIB::__aeabi_i2d, - RTLIB::__aeabi_ui2d, RTLIB::__aeabi_l2d, - RTLIB::__aeabi_ul2d, RTLIB::__aeabi_i2f, - RTLIB::__aeabi_ui2f, RTLIB::__aeabi_l2f, - RTLIB::__aeabi_ul2f, RTLIB::__aeabi_lmul, - RTLIB::__aeabi_llsl, RTLIB::__aeabi_llsr, - RTLIB::__aeabi_lasr, RTLIB::__aeabi_idiv, - RTLIB::__aeabi_idivmod, RTLIB::__aeabi_uidivmod, - RTLIB::__aeabi_ldivmod, 
RTLIB::__aeabi_uidiv, - RTLIB::__aeabi_uldivmod, RTLIB::__aeabi_f2h, - RTLIB::__aeabi_d2h, RTLIB::__aeabi_h2f, - RTLIB::__aeabi_memcpy, RTLIB::__aeabi_memmove, - RTLIB::__aeabi_memset, RTLIB::__aeabi_memcpy4, - RTLIB::__aeabi_memcpy8, RTLIB::__aeabi_memmove4, - RTLIB::__aeabi_memmove8, RTLIB::__aeabi_memset4, - RTLIB::__aeabi_memset8, RTLIB::__aeabi_memclr, - RTLIB::__aeabi_memclr4, RTLIB::__aeabi_memclr8}; - - for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls) - Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS); -} +#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. @@ -69,65 +31,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) { - setTargetRuntimeLibcallSets(TT, FloatABI); - - if (ExceptionModel == ExceptionHandling::SjLj) - setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume); + setTargetRuntimeLibcallSets(TT, ExceptionModel, FloatABI, EABIVersion, + ABIName); if (TT.isARM() || TT.isThumb()) { - setARMLibcallNames(*this, TT, FloatABI, EABIVersion); - return; - } + // The half <-> float conversion functions are always soft-float on + // non-watchos platforms, but are needed for some targets which use a + // hard-float calling convention by default. + if (!TT.isWatchABI()) { + if (isAAPCS_ABI(TT, ABIName)) { + setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); + } else { + setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); + } + } - if (TT.getArch() == Triple::ArchType::msp430) { - setLibcallImplCallingConv(RTLIB::__mspabi_mpyll, - CallingConv::MSP430_BUILTIN); + return; } } -RTLIB::LibcallImpl -RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { - const ArrayRef<uint16_t> RuntimeLibcallNameOffsets( - RuntimeLibcallNameOffsetTable); - - iterator_range<ArrayRef<uint16_t>::const_iterator> Range = - getRecognizedLibcallImpls(FuncName); - - for (auto I = Range.begin(); I != Range.end(); ++I) { - RTLIB::LibcallImpl Impl = - static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin()); - - // FIXME: This should not depend on looking up ImplToLibcall, only the list - // of libcalls for the module. 
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; - if (Recognized != RTLIB::Unsupported) - return Recognized; +LLVM_ATTRIBUTE_ALWAYS_INLINE +iota_range<RTLIB::LibcallImpl> +RuntimeLibcallsInfo::libcallImplNameHit(uint16_t NameOffsetEntry, + uint16_t StrOffset) { + int NumAliases = 1; + for (uint16_t Entry : ArrayRef(RuntimeLibcallNameOffsetTable) + .drop_front(NameOffsetEntry + 1)) { + if (Entry != StrOffset) + break; + ++NumAliases; } - return RTLIB::Unsupported; + RTLIB::LibcallImpl ImplStart = static_cast<RTLIB::LibcallImpl>( + &RuntimeLibcallNameOffsetTable[NameOffsetEntry] - + &RuntimeLibcallNameOffsetTable[0]); + return enum_seq(ImplStart, + static_cast<RTLIB::LibcallImpl>(ImplStart + NumAliases)); } -iterator_range<ArrayRef<uint16_t>::const_iterator> -RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { - StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); - if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) - return iterator_range(ArrayRef<uint16_t>()); - - uint16_t IndexVal = It.offset().value(); - const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable); - - ArrayRef<uint16_t>::const_iterator E = TableRef.end(); - ArrayRef<uint16_t>::const_iterator EntriesBegin = - std::lower_bound(TableRef.begin(), E, IndexVal); - ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin; - - while (EntriesEnd != E && *EntriesEnd == IndexVal) - ++EntriesEnd; - - assert(EntriesBegin != E && - "libcall found in name table but not offset table"); - - return make_range(EntriesBegin, EntriesEnd); +bool RuntimeLibcallsInfo::isAAPCS_ABI(const Triple &TT, StringRef ABIName) { + const ARM::ARMABI TargetABI = ARM::computeTargetABI(TT, ABIName); + return TargetABI == ARM::ARM_ABI_AAPCS || TargetABI == ARM::ARM_ABI_AAPCS16; } bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 129ca4a..5928c89 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets( // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. - unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType()); - if (CurBitWidth == BitWidth) { - if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis)) - return V; - } else { - APInt GEPOffset(CurBitWidth, 0); - if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) - return V; + APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0); + if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) + return V; - // Stop traversal if the pointer offset wouldn't fit in the bit-width - // provided by the Offset argument. This can happen due to AddrSpaceCast - // stripping. - if (GEPOffset.getSignificantBits() > BitWidth) - return V; + // Stop traversal if the pointer offset wouldn't fit in the bit-width + // provided by the Offset argument. This can happen due to AddrSpaceCast + // stripping. + if (GEPOffset.getSignificantBits() > BitWidth) + return V; - // External Analysis can return a result higher/lower than the value - // represents. We need to detect overflow/underflow. 
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); - if (!ExternalAnalysis) { - Offset += GEPOffsetST; - } else { - bool Overflow = false; - APInt OldOffset = Offset; - Offset = Offset.sadd_ov(GEPOffsetST, Overflow); - if (Overflow) { - Offset = OldOffset; - return V; - } + // External Analysis can return a result higher/lower than the value + // represents. We need to detect overflow/underflow. + APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); + if (!ExternalAnalysis) { + Offset += GEPOffsetST; + } else { + bool Overflow = false; + APInt OldOffset = Offset; + Offset = Offset.sadd_ov(GEPOffsetST, Overflow); + if (Overflow) { + Offset = OldOffset; + return V; } } V = GEP->getPointerOperand(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ca3f148..9d9b51d 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -566,6 +566,8 @@ private: void visitUIToFPInst(UIToFPInst &I); void visitSIToFPInst(SIToFPInst &I); void visitIntToPtrInst(IntToPtrInst &I); + void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V); + void visitPtrToAddrInst(PtrToAddrInst &I); void visitPtrToIntInst(PtrToIntInst &I); void visitBitCastInst(BitCastInst &I); void visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { &GV); Check(GV.getInitializer()->getType()->isSized(), "Global variable initializer must be sized", &GV); + visitConstantExprsRecursively(GV.getInitializer()); // If the global has common linkage, it must have a zero initializer and // cannot be constant. if (GV.hasCommonLinkage()) { @@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0), CE->getType()), "Invalid bitcast", CE); + else if (CE->getOpcode() == Instruction::PtrToAddr) + checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE); } void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { @@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) { visitInstruction(I); } +void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) { + Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V); + Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch", + V); + + if (SrcTy->isVectorTy()) { + auto *VSrc = cast<VectorType>(SrcTy); + auto *VDest = cast<VectorType>(DestTy); + Check(VSrc->getElementCount() == VDest->getElementCount(), + "PtrToAddr vector length mismatch", V); + } + + Type *AddrTy = DL.getAddressType(SrcTy); + Check(AddrTy == DestTy, "PtrToAddr result must be address width", V); +} + +void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) { + checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I); + visitInstruction(I); +} + void Verifier::visitPtrToIntInst(PtrToIntInst &I) { // Get the source and destination types Type *SrcTy = I.getOperand(0)->getType(); @@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "PtrToInt Vector width mismatch", &I); + "PtrToInt Vector length mismatch", &I); } visitInstruction(I); @@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == 
VDest->getElementCount(), - "IntToPtr Vector width mismatch", &I); + "IntToPtr Vector length mismatch", &I); } visitInstruction(I); } @@ -4609,7 +4636,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { } // The edge may exit from zero or more nested pads. - SmallSet<Value *, 8> Seen; + SmallPtrSet<Value *, 8> Seen; for (;; FromPad = getParentPad(FromPad)) { Check(FromPad != ToPad, "EH pad cannot handle exceptions raised within it", FromPad, TI); @@ -4737,7 +4764,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { User *FirstUser = nullptr; Value *FirstUnwindPad = nullptr; SmallVector<FuncletPadInst *, 8> Worklist({&FPI}); - SmallSet<FuncletPadInst *, 8> Seen; + SmallPtrSet<FuncletPadInst *, 8> Seen; while (!Worklist.empty()) { FuncletPadInst *CurrentPad = Worklist.pop_back_val(); @@ -6612,6 +6639,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "Value for inactive lanes must be a VGPR function argument", &Call); break; } + case Intrinsic::amdgcn_call_whole_wave: { + auto F = dyn_cast<Function>(Call.getArgOperand(0)); + Check(F, "Indirect whole wave calls are not allowed", &Call); + + CallingConv::ID CC = F->getCallingConv(); + Check(CC == CallingConv::AMDGPU_Gfx_WholeWave, + "Callee must have the amdgpu_gfx_whole_wave calling convention", + &Call); + + Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call); + + Check(Call.arg_size() == F->arg_size(), + "Call argument count must match callee argument count", &Call); + + // The first argument of the call is the callee, and the first argument of + // the callee is the active mask. The rest of the arguments must match. + Check(F->arg_begin()->getType()->isIntegerTy(1), + "Callee must have i1 as its first argument", &Call); + for (auto [CallArg, FuncArg] : + drop_begin(zip_equal(Call.args(), F->args()))) { + Check(CallArg->getType() == FuncArg.getType(), + "Argument types must match", &Call); + + // Check that inreg attributes match between call site and function + Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) == + FuncArg.hasInRegAttr(), + "Argument inreg attributes must match", &Call); + } + break; + } case Intrinsic::amdgcn_s_prefetch_data: { Check( AMDGPU::isFlatGlobalAddrSpace( @@ -6770,7 +6827,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Ptr = Call.getArgOperand(1); + Value *Ptr = Call.getArgOperand(0); Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr), "llvm.lifetime.start/end can only be used on alloca or poison", &Call); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 0323b4d..35d24c1 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1422,7 +1422,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { for (RTLIB::LibcallImpl Impl : LibcallImpls) { if (Impl != RTLIB::Unsupported) - LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl)); + LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data()); } return LibcallSymbols; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index a466ce5..d6c15de 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1133,8 +1133,11 @@ void IRLinker::linkNamedMDNodes() { NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); // Add Src elements into Dest node. 
- for (const MDNode *Op : NMD.operands()) - DestNMD->addOperand(Mapper.mapMDNode(*Op)); + for (const MDNode *Op : NMD.operands()) { + MDNode *MD = Mapper.mapMDNode(*Op); + if (!is_contained(DestNMD->operands(), MD)) + DestNMD->addOperand(MD); + } } } diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 18a85b3..1e1d0a6 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_component_library(LLVMMC MCSection.cpp MCSectionMachO.cpp MCStreamer.cpp + MCSFrame.cpp MCSPIRVStreamer.cpp MCSubtargetInfo.cpp MCSymbol.cpp diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 8f3814a..759d3e0 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -541,12 +541,12 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) { if (Symbol.isAbsolute()) { MSD.SectionIndex = ELF::SHN_ABS; } else if (Symbol.isCommon()) { - if (Symbol.isTargetCommon()) { - MSD.SectionIndex = Symbol.getIndex(); - } else { + auto Shndx = Symbol.getIndex(); + if (!Shndx) { assert(!Local); - MSD.SectionIndex = ELF::SHN_COMMON; + Shndx = ELF::SHN_COMMON; } + MSD.SectionIndex = Shndx; } else if (Symbol.isUndefined()) { if (Symbol.isSignature() && !Symbol.isUsedInReloc()) { MSD.SectionIndex = RevGroupMap.lookup(&Symbol); diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 3b629cd..d68f4af 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCSectionGOFF.h" #include "llvm/MC/MCSymbolGOFF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp index 0a5d1927..092736b 100644 --- a/llvm/lib/MC/MCAsmInfoGOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp @@ -62,6 +62,8 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, OS << ','; OS << "RMODE("; switch (Rmode) { + case GOFF::ESD_RMODE_None: + llvm_unreachable(""); case GOFF::ESD_RMODE_24: OS << "24"; break; @@ -71,8 +73,6 @@ static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, case GOFF::ESD_RMODE_64: OS << "64"; break; - case GOFF::ESD_RMODE_None: - break; } OS << ')'; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 9a5e070..89e541a 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -28,7 +28,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 5e364e9..1d211a1 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -42,7 +42,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 275e76e..2881d7c 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -29,7 +29,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include 
"llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" #include <cassert> diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 6cbdf74..21da79b 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -68,6 +68,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << "\n Fixup @" << F.getOffset() << " Value:"; F.getValue()->print(OS, nullptr); OS << " Kind:" << F.getKind(); + if (F.isLinkerRelaxable()) + OS << " LinkerRelaxable"; } }; diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 4ac73ab..d505ac6 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/SFrame.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -23,7 +24,6 @@ #include "llvm/MC/MCSectionSPIRV.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSectionXCOFF.h" -#include "llvm/Support/Casting.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -380,6 +380,19 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { unsigned EHSectionType = T.getArch() == Triple::x86_64 ? ELF::SHT_X86_64_UNWIND : ELF::SHT_PROGBITS; + switch (T.getArch()) { + case Triple::x86_64: + SFrameABIArch = sframe::ABI::AMD64EndianLittle; + break; + case Triple::aarch64: + SFrameABIArch = sframe::ABI::AArch64EndianLittle; + break; + case Triple::aarch64_be: + SFrameABIArch = sframe::ABI::AArch64EndianBig; + break; + default: + break; + } // Solaris requires different flags for .eh_frame to seemingly every other // platform. @@ -537,6 +550,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { EHFrameSection = Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); + SFrameSection = + Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC); + CallGraphSection = Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, 0); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); @@ -1064,6 +1080,7 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC, CompactUnwindDwarfEHFrameOnly = 0; EHFrameSection = nullptr; // Created on demand. + SFrameSection = nullptr; // Created on demand. CompactUnwindSection = nullptr; // Used only by selected targets. DwarfAccelNamesSection = nullptr; // Used only by selected targets. DwarfAccelObjCSection = nullptr; // Used only by selected targets. 
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d0c6144..59265bc 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSFrame.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" @@ -30,7 +31,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, : MCStreamer(Context), Assembler(std::make_unique<MCAssembler>( Context, std::move(TAB), std::move(Emitter), std::move(OW))), - EmitEHFrame(true), EmitDebugFrame(false) { + EmitEHFrame(true), EmitDebugFrame(false), EmitSFrame(false) { assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr()); IsObj = true; setAllowAutoPadding(Assembler->getBackend().allowAutoPadding()); @@ -186,6 +187,10 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { if (EmitDebugFrame) MCDwarfFrameEmitter::Emit(*this, MAB, false); + + if (EmitSFrame || (getContext().getTargetOptions() && + getContext().getTargetOptions()->EmitSFrameUnwind)) + MCSFrameEmitter::emit(*this); } void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { @@ -461,11 +466,23 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI); F->Kind = MCFragment::FT_Relaxable; - F->STI = &STI; - F->HasInstructions = true; + F->setHasInstructions(STI); + F->setVarContents(Data); - F->setVarFixups(Fixups); F->setInst(Inst); + + bool MarkedLinkerRelaxable = false; + for (auto &Fixup : Fixups) { + if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable) + continue; + MarkedLinkerRelaxable = true; + auto *Sec = F->getParent(); + if (!Sec->isLinkerRelaxable()) + Sec->setFirstLinkerRelaxable(F->getLayoutOrder()); + F->setLinkerRelaxable(); + } + F->setVarFixups(Fixups); + newFragment(); } diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 229b0b8..1bb617b 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -18,7 +18,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include <cstdint> #include <utility> diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 6782c4b..513f3b3 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include <cassert> #include <cstdint> diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 6c2d241..ddfe1e1 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolWasm.h" -#include "llvm/Support/Casting.h" #include <optional> using namespace llvm; diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp new file mode 100644 index 0000000..447f22e --- /dev/null +++ b/llvm/lib/MC/MCSFrame.cpp @@ -0,0 +1,98 @@ +//===- lib/MC/MCSFrame.cpp - MCSFrame implementation ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCSFrame.h" +#include "llvm/BinaryFormat/SFrame.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; +using namespace sframe; + +namespace { + +// Emitting these field-by-field, instead of constructing the actual structures +// lets Streamer do target endian-fixups for free. + +class SFrameEmitterImpl { + MCObjectStreamer &Streamer; + ABI SFrameABI; + MCSymbol *FDESubSectionStart; + MCSymbol *FRESubSectionStart; + MCSymbol *FRESubSectionEnd; + +public: + SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) { + assert(Streamer.getContext() + .getObjectFileInfo() + ->getSFrameABIArch() + .has_value()); + SFrameABI = *Streamer.getContext().getObjectFileInfo()->getSFrameABIArch(); + FDESubSectionStart = Streamer.getContext().createTempSymbol(); + FRESubSectionStart = Streamer.getContext().createTempSymbol(); + FRESubSectionEnd = Streamer.getContext().createTempSymbol(); + } + + void emitPreamble() { + Streamer.emitInt16(Magic); + Streamer.emitInt8(static_cast<uint8_t>(Version::V2)); + Streamer.emitInt8(0); + } + + void emitHeader() { + emitPreamble(); + // sfh_abi_arch + Streamer.emitInt8(static_cast<uint8_t>(SFrameABI)); + // sfh_cfa_fixed_fp_offset + Streamer.emitInt8(0); + // sfh_cfa_fixed_ra_offset + Streamer.emitInt8(0); + // sfh_auxhdr_len + Streamer.emitInt8(0); + // shf_num_fdes + Streamer.emitInt32(0); + // shf_num_fres + Streamer.emitInt32(0); + // shf_fre_len + Streamer.emitAbsoluteSymbolDiff(FRESubSectionEnd, FRESubSectionStart, + sizeof(int32_t)); + // shf_fdeoff. With no sfh_auxhdr, these immediately follow this header. + Streamer.emitInt32(0); + // shf_freoff + Streamer.emitAbsoluteSymbolDiff(FRESubSectionStart, FDESubSectionStart, + sizeof(uint32_t)); + } + + void emitFDEs() { Streamer.emitLabel(FDESubSectionStart); } + + void emitFREs() { + Streamer.emitLabel(FRESubSectionStart); + Streamer.emitLabel(FRESubSectionEnd); + } +}; + +} // end anonymous namespace + +void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) { + MCContext &Context = Streamer.getContext(); + SFrameEmitterImpl Emitter(Streamer); + + MCSection *Section = Context.getObjectFileInfo()->getSFrameSection(); + // Not strictly necessary, but gas always aligns to 8, so match that. 
+ Section->ensureMinAlignment(Align(8)); + Streamer.switchSection(Section); + MCSymbol *SectionStart = Context.createTempSymbol(); + Streamer.emitLabel(SectionStart); + Emitter.emitHeader(); + Emitter.emitFDEs(); + Emitter.emitFREs(); +} diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 9ed6fd1..a668e79 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -39,6 +39,8 @@ LLVM_DUMP_METHOD void MCSection::dump( raw_ostream &OS = errs(); OS << "MCSection Name:" << getName(); + if (isLinkerRelaxable()) + OS << " FirstLinkerRelaxable:" << firstLinkerRelaxable(); for (auto &F : *this) { OS << '\n'; F.dump(); diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp index 8192896..b868738 100644 --- a/llvm/lib/MC/MCSymbol.cpp +++ b/llvm/lib/MC/MCSymbol.cpp @@ -20,6 +20,10 @@ using namespace llvm; +// There are numerous MCSymbol objects, so keeping sizeof(MCSymbol) small is +// crucial for minimizing peak memory usage. +static_assert(sizeof(MCSymbol) <= 24, "Keep the base symbol small"); + // Only the address of this fragment is ever actually used. static MCFragment SentinelFragment; @@ -44,13 +48,12 @@ void *MCSymbol::operator new(size_t s, const MCSymbolTableEntry *Name, } void MCSymbol::setVariableValue(const MCExpr *Value) { - assert(Value && "Invalid variable value!"); - assert((SymbolContents == SymContentsUnset || - SymbolContents == SymContentsVariable) && - "Cannot give common/offset symbol a variable value"); + assert(Value && "Invalid equated expression"); + assert((kind == Kind::Regular || kind == Kind::Equated) && + "Cannot equate a common symbol"); this->Value = Value; - SymbolContents = SymContentsVariable; - setUndefined(); + kind = Kind::Equated; + Fragment = nullptr; } void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 9c8b224..070b3d9 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index a45936b..2e632de 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -30,7 +30,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index a0e3dba..684e05a 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -21,7 +21,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 13917ba..fce6b2a 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/StringTableBuilder.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" diff --git 
a/llvm/lib/MCA/Instruction.cpp b/llvm/lib/MCA/Instruction.cpp index d4adfce..7966708 100644 --- a/llvm/lib/MCA/Instruction.cpp +++ b/llvm/lib/MCA/Instruction.cpp @@ -128,6 +128,13 @@ void WriteState::dump() const { } #endif +#ifndef NDEBUG +void ReadState::dump() const { + dbgs() << "{ OpIdx=" << RD->OpIndex << ", RegID " << getRegisterID() + << ", Cycles Left=" << CyclesLeft << " }"; +} +#endif + const CriticalDependency &Instruction::computeCriticalRegDep() { if (CriticalRegDep.Cycles) return CriticalRegDep; diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 0f19495..0043f02 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -46,7 +46,7 @@ static cl::opt<bool> DisableBitcodeVersionUpgrade( "disable-bitcode-version-upgrade", cl::Hidden, cl::desc("Disable automatic bitcode upgrade for version mismatch")); -static const char *PreservedSymbols[] = { +static constexpr StringLiteral PreservedSymbols[] = { // There are global variables, so put it here instead of in // RuntimeLibcalls.td. // TODO: Are there similar such variables? @@ -54,6 +54,10 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; +static bool isPreservedGlobalVarName(StringRef Name) { + return PreservedSymbols[0] == Name || PreservedSymbols[1] == Name; +} + namespace { const char *getExpectedProducerName() { @@ -81,12 +85,16 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} + BumpPtrAllocator &Alloc, const Triple &TT) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), + Libcalls(TT) {} DenseMap<const Comdat *, int> ComdatMap; Mangler Mang; - Triple TT; + const Triple &TT; + + // FIXME: This shouldn't be here. + RTLIB::RuntimeLibcallsInfo Libcalls; std::vector<storage::Comdat> Comdats; std::vector<storage::Module> Mods; @@ -98,6 +106,10 @@ struct Builder { std::vector<storage::Str> DependentLibraries; + bool isPreservedLibFuncName(StringRef Name) { + return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; + } + void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -213,19 +225,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static StringSet<> buildPreservedSymbolsSet(const Triple &TT) { - StringSet<> PreservedSymbolSet; - PreservedSymbolSet.insert(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); - // FIXME: Do we need to pass in ABI fields from TargetOptions? 
- RTLIB::RuntimeLibcallsInfo Libcalls(TT); - for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { - if (Impl != RTLIB::Unsupported) - PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); - } - return PreservedSymbolSet; -} - Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet<GlobalValue *, 4> &Used, ModuleSymbolTable::Symbol Msym) { @@ -279,13 +278,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - setStr(Sym.IRName, GV->getName()); - - static const StringSet<> PreservedSymbolsSet = - buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); - bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); + StringRef GVName = GV->getName(); + setStr(Sym.IRName, GVName); - if (Used.count(GV) || IsPreservedSymbol) + if (Used.count(GV) || isPreservedLibFuncName(GVName) || + isPreservedGlobalVarName(GVName)) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -352,7 +349,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); - TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -378,7 +374,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) { Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); + const Triple &TT = Mods[0]->getTargetTriple(); + return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 5863490..759b579 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -32,14 +32,25 @@ getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) { } template <typename T> -static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, - uint64_t Offset) { +static Expected<ArrayRef<T>> +getDataSliceAsArrayOf(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Count) { static_assert(std::is_trivial_v<T>); - Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T)); + Expected<ArrayRef<uint8_t>> Slice = + getDataSlice(Data, Offset, sizeof(T) * Count); if (!Slice) return Slice.takeError(); - return *reinterpret_cast<const T *>(Slice->data()); + return ArrayRef(reinterpret_cast<const T *>(Slice->data()), Count); +} + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + Expected<ArrayRef<T>> Array = getDataSliceAsArrayOf<T>(Data, Offset, 1); + if (!Array) + return Array.takeError(); + + return Array->front(); } template <endianness E> @@ -87,17 +98,134 @@ uint64_t SFrameParser<E>::getAbsoluteStartAddress( uint64_t Result = SectionAddress + FDE->StartAddress; if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) == - sframe::Flags::FDEFuncStartPCRel) { - uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); - uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + sframe::Flags::FDEFuncStartPCRel) + Result += offsetOf(FDE); + + return Result; +} - assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && - "Iterator does not belong to this object!"); +template <endianness E> +uint64_t 
SFrameParser<E>::offsetOf(typename FDERange::iterator FDE) const { + uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); + uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + + assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && + "Iterator does not belong to this object!"); + return FDEPtr - DataPtr; +} - Result += FDEPtr - DataPtr; +template <typename EndianT> +static Error readArray(ArrayRef<uint8_t> Data, uint64_t Count, uint64_t &Offset, + SmallVectorImpl<int32_t> &Vec) { + Expected<ArrayRef<EndianT>> RawArray = + getDataSliceAsArrayOf<EndianT>(Data, Offset, Count); + if (!RawArray) + return RawArray.takeError(); + Offset += Count * sizeof(EndianT); + Vec.resize(Count); + llvm::copy(*RawArray, Vec.begin()); + return Error::success(); +} + +template <typename T, endianness E> +static Error readFRE(ArrayRef<uint8_t> Data, uint64_t &Offset, + typename SFrameParser<E>::FrameRowEntry &FRE) { + Expected<sframe::FrameRowEntry<T, E>> RawFRE = + getDataSliceAs<sframe::FrameRowEntry<T, E>>(Data, Offset); + if (!RawFRE) + return RawFRE.takeError(); + + Offset += sizeof(*RawFRE); + FRE.StartAddress = RawFRE->StartAddress; + FRE.Info.Info = RawFRE->Info.Info; + + switch (FRE.Info.getOffsetSize()) { + case sframe::FREOffset::B1: + return readArray<sframe::detail::packed<int8_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); + case sframe::FREOffset::B2: + return readArray<sframe::detail::packed<int16_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); + case sframe::FREOffset::B4: + return readArray<sframe::detail::packed<int32_t, E>>( + Data, FRE.Info.getOffsetCount(), Offset, FRE.Offsets); } + return createError(formatv("unsupported FRE offset size {0} at offset {1:x+}", + static_cast<unsigned>(FRE.Info.getOffsetSize()), + Offset)); +} - return Result; +template <endianness E> Error SFrameParser<E>::FallibleFREIterator::inc() { + if (++Idx == Size) + return Error::success(); + + switch (FREType) { + case sframe::FREType::Addr1: + return readFRE<uint8_t, E>(Data, Offset, FRE); + case sframe::FREType::Addr2: + return readFRE<uint16_t, E>(Data, Offset, FRE); + case sframe::FREType::Addr4: + return readFRE<uint32_t, E>(Data, Offset, FRE); + } + return createError(formatv("unsupported FRE type {0} at offset {1:x+}", + static_cast<unsigned>(FREType), Offset)); +} + +template <endianness E> +iterator_range<typename SFrameParser<E>::fre_iterator> +SFrameParser<E>::fres(const sframe::FuncDescEntry<E> &FDE, Error &Err) const { + uint64_t Offset = getFREBase() + FDE.StartFREOff; + fre_iterator BeforeBegin = make_fallible_itr( + FallibleFREIterator(Data, FDE.Info.getFREType(), -1, FDE.NumFREs, Offset), + Err); + fre_iterator End = make_fallible_end( + FallibleFREIterator(Data, FDE.Info.getFREType(), FDE.NumFREs, FDE.NumFREs, + /*Offset=*/0)); + return {++BeforeBegin, End}; +} + +static std::optional<int32_t> getOffset(ArrayRef<int32_t> Offsets, size_t Idx) { + if (Offsets.size() > Idx) + return Offsets[Idx]; + return std::nullopt; +} + +// The interpretation of offsets is ABI-specific. The implementation of this and +// the following functions may need to be adjusted when adding support for a new +// ABI. 
+template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getCFAOffset(const FrameRowEntry &FRE) const { + return getOffset(FRE.Offsets, 0); +} + +template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getRAOffset(const FrameRowEntry &FRE) const { + if (usesFixedRAOffset()) + return Header.CFAFixedRAOffset; + return getOffset(FRE.Offsets, 1); +} + +template <endianness E> +std::optional<int32_t> +SFrameParser<E>::getFPOffset(const FrameRowEntry &FRE) const { + if (usesFixedFPOffset()) + return Header.CFAFixedFPOffset; + return getOffset(FRE.Offsets, usesFixedRAOffset() ? 1 : 2); +} + +template <endianness E> +ArrayRef<int32_t> +SFrameParser<E>::getExtraOffsets(const FrameRowEntry &FRE) const { + size_t UsedOffsets = 1; // CFA + if (!usesFixedRAOffset()) + ++UsedOffsets; + if (!usesFixedFPOffset()) + ++UsedOffsets; + if (FRE.Offsets.size() > UsedOffsets) + return ArrayRef<int32_t>(FRE.Offsets).drop_front(UsedOffsets); + return {}; } template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f810368..b7edeea 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -124,7 +124,6 @@ #include "llvm/CodeGen/MachineCopyPropagation.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineLICM.h" #include "llvm/CodeGen/MachineLateInstrsCleanup.h" #include "llvm/CodeGen/MachinePassManager.h" diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index cdf4412..fc2577e 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -519,7 +519,7 @@ Error InstrProfSymtab::create(SectionRef &Section) { return Error::success(); } -StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) { +StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) const { if (Pointer < Address) return StringRef(); auto Offset = Pointer - Address; diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 5425729..7885e12 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -684,13 +684,13 @@ Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName, return Error::success(); } -uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) { +uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) const { // Given a runtime address, look up the hash value in the interval map, and // fallback to value 0 if a hash value is not found. 
return VTableAddrMap.lookup(Address, 0); } -uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) { +uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) const { finalizeSymtab(); auto It = partition_point(AddrToMD5Map, [=](std::pair<uint64_t, uint64_t> A) { return A.first < Address; diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 7ca26aa..a347351 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -13,7 +13,6 @@ #include "llvm/ProfileData/InstrProfWriter.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/DataAccessProf.h" @@ -331,61 +330,34 @@ void InstrProfWriter::addDataAccessProfData( DataAccessProfileData = std::move(DataAccessProfDataIn); } -void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) { - assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength); - assert(!Trace.FunctionNameRefs.empty()); - if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) { - // Simply append the trace if we have not yet hit our reservoir size limit. - TemporalProfTraces.push_back(std::move(Trace)); - } else { - // Otherwise, replace a random trace in the stream. - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); - uint64_t RandomIndex = Distribution(RNG); - if (RandomIndex < TemporalProfTraces.size()) - TemporalProfTraces[RandomIndex] = std::move(Trace); - } - ++TemporalProfTraceStreamSize; -} - void InstrProfWriter::addTemporalProfileTraces( SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) { + if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize) + TemporalProfTraces.truncate(TemporalProfTraceReservoirSize); for (auto &Trace : SrcTraces) if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength) Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength); llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); }); - // Assume that the source has the same reservoir size as the destination to - // avoid needing to record it in the indexed profile format. - bool IsDestSampled = - (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize); - bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize); - if (!IsDestSampled && IsSrcSampled) { - // If one of the traces are sampled, ensure that it belongs to Dest. - std::swap(TemporalProfTraces, SrcTraces); - std::swap(TemporalProfTraceStreamSize, SrcStreamSize); - std::swap(IsDestSampled, IsSrcSampled); - } - if (!IsSrcSampled) { - // If the source stream is not sampled, we add each source trace normally. - for (auto &Trace : SrcTraces) - addTemporalProfileTrace(std::move(Trace)); + // If there are no source traces, it is probably because + // --temporal-profile-max-trace-length=0 was set to deliberately remove all + // traces. In that case, we do not want to increase the stream size + if (SrcTraces.empty()) return; - } - // Otherwise, we find the traces that would have been removed if we added - // the whole source stream. 
- SmallSetVector<uint64_t, 8> IndicesToReplace; - for (uint64_t I = 0; I < SrcStreamSize; I++) { - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); + // Add traces until our reservoir is full or we run out of source traces + auto SrcTraceIt = SrcTraces.begin(); + while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize && + SrcTraceIt < SrcTraces.end()) + TemporalProfTraces.push_back(*SrcTraceIt++); + // Our reservoir is full, we need to sample the source stream + llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG); + for (uint64_t I = TemporalProfTraces.size(); + I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) { + std::uniform_int_distribution<uint64_t> Distribution(0, I); uint64_t RandomIndex = Distribution(RNG); if (RandomIndex < TemporalProfTraces.size()) - IndicesToReplace.insert(RandomIndex); - ++TemporalProfTraceStreamSize; + TemporalProfTraces[RandomIndex] = *SrcTraceIt++; } - // Then we insert a random sample of the source traces. - llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG); - for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces)) - TemporalProfTraces[Index] = std::move(Trace); + TemporalProfTraceStreamSize += SrcStreamSize; } void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fe34037..70ac68a 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { case llvm::Instruction::FPToUI: case llvm::Instruction::FPToSI: case llvm::Instruction::FPExt: + case llvm::Instruction::PtrToAddr: case llvm::Instruction::PtrToInt: case llvm::Instruction::IntToPtr: case llvm::Instruction::SIToFP: diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 956047c..1a81d18 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) { return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI); case Instruction::Opcode::FPExt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt); + case Instruction::Opcode::PtrToAddr: + return static_cast<llvm::Instruction::CastOps>( + llvm::Instruction::PtrToAddr); case Instruction::Opcode::PtrToInt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt); case Instruction::Opcode::IntToPtr: diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 3d688a1..d2a417f 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -5519,13 +5519,129 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) { return opOK; } +APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger( + MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, + roundingMode RM, bool *IsExact) const { + assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + + // If Hi is not finite, or Lo is zero, the value is entirely represented + // by Hi. Delegate to the simpler single-APFloat conversion. + if (!getFirst().isFiniteNonZero() || getSecond().isZero()) + return getFirst().convertToInteger(Input, Width, IsSigned, RM, IsExact); + + // First, round the full double-double value to an integral value. This + // simplifies the rest of the function, as we no longer need to consider + // fractional parts. 
+ *IsExact = false; + DoubleAPFloat Integral = *this; + const opStatus RoundStatus = Integral.roundToIntegral(RM); + if (RoundStatus == opInvalidOp) + return opInvalidOp; + const APFloat &IntegralHi = Integral.getFirst(); + const APFloat &IntegralLo = Integral.getSecond(); + + // If rounding results in either component being zero, the sum is trivial. + // Delegate to the simpler single-APFloat conversion. + bool HiIsExact; + if (IntegralHi.isZero() || IntegralLo.isZero()) { + const opStatus HiStatus = + IntegralHi.convertToInteger(Input, Width, IsSigned, RM, &HiIsExact); + // The conversion from an integer-valued float to an APInt may fail if the + // result would be out of range. Regardless, taking this path is only + // possible if rounding occurred during the initial `roundToIntegral`. + return HiStatus == opOK ? opInexact : HiStatus; + } + + // A negative number cannot be represented by an unsigned integer. + // Since a double-double is canonical, if Hi is negative, the sum is negative. + if (!IsSigned && IntegralHi.isNegative()) + return opInvalidOp; + + // Handle the special boundary case where |Hi| is exactly the power of two + // that marks the edge of the integer's range (e.g., 2^63 for int64_t). In + // this situation, Hi itself won't fit, but the sum Hi + Lo might. + // `PositiveOverflowWidth` is the bit number for this boundary (N-1 for + // signed, N for unsigned). + bool LoIsExact; + const int HiExactLog2 = IntegralHi.getExactLog2Abs(); + const unsigned PositiveOverflowWidth = IsSigned ? Width - 1 : Width; + if (HiExactLog2 >= 0 && + static_cast<unsigned>(HiExactLog2) == PositiveOverflowWidth) { + // If Hi and Lo have the same sign, |Hi + Lo| > |Hi|, so the sum is + // guaranteed to overflow. E.g., for uint128_t, (2^128, 1) overflows. + if (IntegralHi.isNegative() == IntegralLo.isNegative()) + return opInvalidOp; + + // If the signs differ, the sum will fit. We can compute the result using + // properties of two's complement arithmetic without a wide intermediate + // integer. E.g., for uint128_t, (2^128, -1) should be 2^128 - 1. + const opStatus LoStatus = IntegralLo.convertToInteger( + Input, Width, /*IsSigned=*/true, RM, &LoIsExact); + if (LoStatus == opInvalidOp) + return opInvalidOp; + + // Adjust the bit pattern of Lo to account for Hi's value: + // - For unsigned (Hi=2^Width): `2^Width + Lo` in `Width`-bit + // arithmetic is equivalent to just `Lo`. The conversion of `Lo` above + // already produced the correct final bit pattern. + // - For signed (Hi=2^(Width-1)): The sum `2^(Width-1) + Lo` (where Lo<0) + // can be computed by taking the two's complement pattern for `Lo` and + // clearing the sign bit. + if (IsSigned && !IntegralHi.isNegative()) + APInt::tcClearBit(Input.data(), PositiveOverflowWidth); + *IsExact = RoundStatus == opOK; + return RoundStatus; + } + + // Convert Hi into an integer. This may not fit but that is OK: we know that + // Hi + Lo would not fit either in this situation. + const opStatus HiStatus = IntegralHi.convertToInteger( + Input, Width, IsSigned, rmTowardZero, &HiIsExact); + if (HiStatus == opInvalidOp) + return HiStatus; + + // Convert Lo into a temporary integer of the same width. + APSInt LoResult{Width, /*isUnsigned=*/!IsSigned}; + const opStatus LoStatus = + IntegralLo.convertToInteger(LoResult, rmTowardZero, &LoIsExact); + if (LoStatus == opInvalidOp) + return LoStatus; + + // Add Lo to Hi. This addition is guaranteed not to overflow because of the + // double-double canonicalization rule (`|Lo| <= ulp(Hi)/2`). 
The only case + // where the sum could cross the integer type's boundary is when Hi is a + // power of two, which is handled by the special case block above. + APInt::tcAdd(Input.data(), LoResult.getRawData(), /*carry=*/0, Input.size()); + + *IsExact = RoundStatus == opOK; + return RoundStatus; +} + APFloat::opStatus DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) - .convertToInteger(Input, Width, IsSigned, RM, IsExact); + opStatus FS = + convertToSignExtendedInteger(Input, Width, IsSigned, RM, IsExact); + + if (FS == opInvalidOp) { + const unsigned DstPartsCount = partCountForBits(Width); + assert(DstPartsCount <= Input.size() && "Integer too big"); + + unsigned Bits; + if (getCategory() == fcNaN) + Bits = 0; + else if (isNegative()) + Bits = IsSigned; + else + Bits = Width - IsSigned; + + tcSetLeastSignificantBits(Input.data(), DstPartsCount, Bits); + if (isNegative() && IsSigned) + APInt::tcShiftLeft(Input.data(), DstPartsCount, Width - 1); + } + + return FS; } APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input, @@ -5626,14 +5742,30 @@ bool DoubleAPFloat::getExactInverse(APFloat *inv) const { return Ret; } -int DoubleAPFloat::getExactLog2() const { - // TODO: Implement me - return INT_MIN; -} - int DoubleAPFloat::getExactLog2Abs() const { - // TODO: Implement me - return INT_MIN; + // In order for Hi + Lo to be a power of two, the following must be true: + // 1. Hi must be a power of two. + // 2. Lo must be zero. + if (getSecond().isNonZero()) + return INT_MIN; + return getFirst().getExactLog2Abs(); +} + +int ilogb(const DoubleAPFloat &Arg) { + const APFloat &Hi = Arg.getFirst(); + const APFloat &Lo = Arg.getSecond(); + int IlogbResult = ilogb(Hi); + // Zero and non-finite values can delegate to ilogb(Hi). + if (Arg.getCategory() != fcNormal) + return IlogbResult; + // If Lo can't change the binade, we can delegate to ilogb(Hi). + if (Lo.isZero() || Hi.isNegative() == Lo.isNegative()) + return IlogbResult; + if (Hi.getExactLog2Abs() == INT_MIN) + return IlogbResult; + // Numbers of the form 2^a - 2^b or -2^a + 2^b are almost powers of two but + // get nudged out of the binade by the low component. + return IlogbResult - 1; } DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, @@ -5646,10 +5778,101 @@ DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, APFloat::roundingMode RM) { assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat First = frexp(Arg.Floats[0], Exp, RM); - APFloat Second = Arg.Floats[1]; - if (Arg.getCategory() == APFloat::fcNormal) - Second = scalbn(Second, -Exp, RM); + + // Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in + // [1.0, 2.0). + Exp = ilogb(Arg); + + // For NaNs, quiet any signaling NaN and return the result, as per standard + // practice. + if (Exp == APFloat::IEK_NaN) { + DoubleAPFloat Quiet{Arg}; + Quiet.getFirst().makeQuiet(); + return Quiet; + } + + // For infinity, return it unchanged. The exponent remains IEK_Inf. + if (Exp == APFloat::IEK_Inf) + return Arg; + + // For zero, the fraction is zero and the standard requires the exponent be 0. 
+ if (Exp == APFloat::IEK_Zero) { + Exp = 0; + return Arg; + } + + const APFloat &Hi = Arg.getFirst(); + const APFloat &Lo = Arg.getSecond(); + + // frexp requires the fraction's absolute value to be in [0.5, 1.0). + // ilogb provides an exponent for an absolute value in [1.0, 2.0). + // Increment the exponent to ensure the fraction is in the correct range. + ++Exp; + + const bool SignsDisagree = Hi.isNegative() != Lo.isNegative(); + APFloat Second = Lo; + if (Arg.getCategory() == APFloat::fcNormal && Lo.isFiniteNonZero()) { + roundingMode LoRoundingMode; + // The interpretation of rmTowardZero depends on the sign of the combined + // Arg rather than the sign of the component. + if (RM == rmTowardZero) + LoRoundingMode = Arg.isNegative() ? rmTowardPositive : rmTowardNegative; + // For rmNearestTiesToAway, we face a similar problem. If signs disagree, + // Lo is a correction *toward* zero relative to Hi. Rounding Lo + // "away from zero" based on its own sign would move the value in the + // wrong direction. As a safe proxy, we use rmNearestTiesToEven, which is + // direction-agnostic. We only need to bother with this if Lo is scaled + // down. + else if (RM == rmNearestTiesToAway && SignsDisagree && Exp > 0) + LoRoundingMode = rmNearestTiesToEven; + else + LoRoundingMode = RM; + Second = scalbn(Lo, -Exp, LoRoundingMode); + // The rmNearestTiesToEven proxy is correct most of the time, but it + // differs from rmNearestTiesToAway when the scaled value of Lo is an + // exact midpoint. + // NOTE: This is morally equivalent to roundTiesTowardZero. + if (RM == rmNearestTiesToAway && LoRoundingMode == rmNearestTiesToEven) { + // Re-scale the result back to check if rounding occurred. + const APFloat RecomposedLo = scalbn(Second, Exp, rmNearestTiesToEven); + if (RecomposedLo != Lo) { + // RoundingError tells us which direction we rounded: + // - RoundingError > 0: we rounded up. + // - RoundingError < 0: we rounded down. + const APFloat RoundingError = RecomposedLo - Lo; + // Determine if scalbn(Lo, -Exp) landed exactly on a midpoint. + // We do this by checking if the absolute rounding error is exactly + // half a ULP of the result. + const APFloat UlpOfSecond = harrisonUlp(Second); + const APFloat ScaledUlpOfSecond = + scalbn(UlpOfSecond, Exp - 1, rmNearestTiesToEven); + const bool IsMidpoint = abs(RoundingError) == ScaledUlpOfSecond; + const bool RoundedLoAway = + Second.isNegative() == RoundingError.isNegative(); + // The signs of Hi and Lo disagree and we rounded Lo away: we must + // decrease the magnitude of Second to increase the magnitude of + // First+Second. + if (IsMidpoint && RoundedLoAway) + Second.next(/*nextDown=*/!Second.isNegative()); + } + } + // Handle a tricky edge case where Arg is slightly less than a power of two + // (e.g., Arg = 2^k - epsilon). In this situation: + // 1. Hi is 2^k, and Lo is a small negative value -epsilon. + // 2. ilogb(Arg) correctly returns k-1. + // 3. Our initial Exp becomes (k-1) + 1 = k. + // 4. Scaling Hi (2^k) by 2^-k would yield a magnitude of 1.0 and + // scaling Lo by 2^-k would yield zero. This would make the result 1.0 + // which is an invalid fraction, as the required interval is [0.5, 1.0). + // We detect this specific case by checking if Hi is a power of two and if + // the scaled Lo underflowed to zero. The fix: Increment Exp to k+1. This + // adjusts the scale factor, causing Hi to be scaled to 0.5, which is a + valid fraction.
+ if (Second.isZero() && SignsDisagree && Hi.getExactLog2Abs() != INT_MIN) + ++Exp; + } + + APFloat First = scalbn(Hi, -Exp, RM); return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second)); } @@ -5749,10 +5972,6 @@ void APFloat::Profile(FoldingSetNodeID &NID) const { NID.Add(bitcastToAPInt()); } -/* Same as convertToInteger(integerPart*, ...), except the result is returned in - an APSInt, whose initial bit-width and signed-ness are used to determine the - precision of the conversion. - */ APFloat::opStatus APFloat::convertToInteger(APSInt &result, roundingMode rounding_mode, bool *isExact) const { diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 954af7f..0c0e1d0 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -1377,7 +1377,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r, // the true value, and a "borrow" to the left should be remembered. int64_t borrow = 0; for (unsigned i = 0; i < n; ++i) { - uint64_t p = uint64_t(qp) * uint64_t(v[i]); + uint64_t p = qp * uint64_t(v[i]); int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p); u[j+i] = Lo_32(subres); borrow = Hi_32(p) - Hi_32(subres); @@ -3136,6 +3136,22 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) { return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth()); } +APInt APIntOps::mulsExtended(const APInt &C1, const APInt &C2) { + assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths"); + unsigned FullWidth = C1.getBitWidth() * 2; + APInt C1Ext = C1.sext(FullWidth); + APInt C2Ext = C2.sext(FullWidth); + return C1Ext * C2Ext; +} + +APInt APIntOps::muluExtended(const APInt &C1, const APInt &C2) { + assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths"); + unsigned FullWidth = C1.getBitWidth() * 2; + APInt C1Ext = C1.zext(FullWidth); + APInt C2Ext = C2.zext(FullWidth); + return C1Ext * C2Ext; +} + APInt APIntOps::pow(const APInt &X, int64_t N) { assert(N >= 0 && "negative exponents not supported."); APInt Acc = APInt(X.getBitWidth(), 1); diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 10b6101..b7578dd 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -182,6 +182,7 @@ add_llvm_component_library(LLVMSupport DivisionByConstantInfo.cpp DAGDeltaAlgorithm.cpp DJB.cpp + DXILABI.cpp DynamicAPInt.cpp ELFAttributes.cpp ELFAttrParserCompact.cpp diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp new file mode 100644 index 0000000..082e320 --- /dev/null +++ b/llvm/lib/Support/DXILABI.cpp @@ -0,0 +1,33 @@ +//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions of various constants and enums that are
+// required to remain stable as per the DXIL format's requirements.
+//
+// Documentation for DXIL can be found in
+// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DXILABI.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+StringRef dxil::getResourceClassName(dxil::ResourceClass RC) {
+ switch (RC) {
+ case dxil::ResourceClass::SRV:
+ return "SRV";
+ case dxil::ResourceClass::UAV:
+ return "UAV";
+ case dxil::ResourceClass::CBuffer:
+ return "CBV";
+ case dxil::ResourceClass::Sampler:
+ return "Sampler";
+ }
+ llvm_unreachable("Invalid ResourceClass enum value");
+}
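The new getResourceClassName above uses an exhaustive switch with no default case, followed by llvm_unreachable, so -Wswitch can still flag any ResourceClass enumerator added later without a name, while the trailing unreachable catches out-of-range values at run time. Below is a minimal standalone illustration of that same pattern; the types and the assert/abort fallback are stand-ins, not the LLVM helper or llvm_unreachable itself.

#include <cassert>
#include <cstdio>
#include <cstdlib>

enum class ResourceClass { SRV, UAV, CBuffer, Sampler };

static const char *resourceClassName(ResourceClass RC) {
  // No default case: the compiler can warn if an enumerator is unhandled.
  switch (RC) {
  case ResourceClass::SRV:
    return "SRV";
  case ResourceClass::UAV:
    return "UAV";
  case ResourceClass::CBuffer:
    return "CBV";
  case ResourceClass::Sampler:
    return "Sampler";
  }
  // Reached only for a value outside the enumeration, mirroring the
  // llvm_unreachable after the fully covered switch above.
  assert(false && "invalid ResourceClass enum value");
  std::abort();
}

int main() {
  std::printf("%s %s\n", resourceClassName(ResourceClass::UAV),
              resourceClassName(ResourceClass::CBuffer));
  return 0;
}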
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 94a04ab..bd08365 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -888,11 +888,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, Res.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown); Res.One = BottomKnown.getLoBits(ResultBitsKnown); - // If we're self-multiplying then bit[1] is guaranteed to be zero. - if (NoUndefSelfMultiply && BitWidth > 1) { - assert(Res.One[1] == 0 && - "Self-multiplication failed Quadratic Reciprocity!"); - Res.Zero.setBit(1); + if (NoUndefSelfMultiply) { + // If X has at least TZ trailing zeroes, then bit (2 * TZ + 1) must be zero. + unsigned TwoTZP1 = 2 * TrailZero0 + 1; + if (TwoTZP1 < BitWidth) + Res.Zero.setBit(TwoTZP1); + + // If X has exactly TZ trailing zeros, then bit (2 * TZ + 2) must also be + // zero. + if (TrailZero0 < BitWidth && LHS.One[TrailZero0]) { + unsigned TwoTZP2 = TwoTZP1 + 1; + if (TwoTZP2 < BitWidth) + Res.Zero.setBit(TwoTZP2); + } } return Res; diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 601f11f..1c4645a 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, std::unique_ptr<MB> Result( new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>( RequiresNullTerminator, FD, MapSize, Offset, EC)); - if (!EC) - return std::move(Result); + if (!EC) { + // On at least Linux, and possibly on other systems, mmap may return pages + // from the page cache that are not properly filled with trailing zeroes, + // if some prior user of the page wrote non-zero bytes. Detect this and + // don't use mmap in that case. + if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0') + return std::move(Result); + } } #ifdef __MVS__ diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp index 83143a7..a602165 100644 --- a/llvm/lib/Support/SmallPtrSet.cpp +++ b/llvm/lib/Support/SmallPtrSet.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemAlloc.h" #include <algorithm> @@ -28,7 +29,7 @@ void SmallPtrSetImplBase::shrink_and_clear() { // Reduce the number of buckets. unsigned Size = size(); CurArraySize = Size > 16 ? 1 << (Log2_32_Ceil(Size) + 1) : 32; - NumNonEmpty = NumTombstones = 0; + NumEntries = NumTombstones = 0; // Install the new array. Clear all the buckets to empty. CurArray = (const void**)safe_malloc(sizeof(void*) * CurArraySize); @@ -41,7 +42,8 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { if (LLVM_UNLIKELY(size() * 4 >= CurArraySize * 3)) { // If more than 3/4 of the array is full, grow. Grow(CurArraySize < 64 ? 128 : CurArraySize * 2); - } else if (LLVM_UNLIKELY(CurArraySize - NumNonEmpty < CurArraySize / 8)) { + } else if (LLVM_UNLIKELY(CurArraySize - NumEntries - NumTombstones < + CurArraySize / 8)) { // If fewer of 1/8 of the array is empty (meaning that many are filled with // tombstones), rehash. Grow(CurArraySize); @@ -55,8 +57,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { // Otherwise, insert it! if (*Bucket == getTombstoneMarker()) --NumTombstones; - else - ++NumNonEmpty; // Track density. 
+ ++NumEntries; *Bucket = Ptr; incrementEpoch(); return std::make_pair(Bucket, true); @@ -110,8 +111,7 @@ const void *const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const { /// Grow - Allocate a larger backing store for the buckets and move it over. /// void SmallPtrSetImplBase::Grow(unsigned NewSize) { - const void **OldBuckets = CurArray; - const void **OldEnd = EndPointer(); + auto OldBuckets = buckets(); bool WasSmall = isSmall(); // Install the new array. Clear all the buckets to empty. @@ -123,16 +123,14 @@ void SmallPtrSetImplBase::Grow(unsigned NewSize) { memset(CurArray, -1, NewSize*sizeof(void*)); // Copy over all valid entries. - for (const void **BucketPtr = OldBuckets; BucketPtr != OldEnd; ++BucketPtr) { + for (const void *&Bucket : OldBuckets) { // Copy over the element if it is valid. - const void *Elt = *BucketPtr; - if (Elt != getTombstoneMarker() && Elt != getEmptyMarker()) - *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt); + if (Bucket != getTombstoneMarker() && Bucket != getEmptyMarker()) + *const_cast<void **>(FindBucketFor(Bucket)) = const_cast<void *>(Bucket); } if (!WasSmall) - free(OldBuckets); - NumNonEmpty -= NumTombstones; + free(OldBuckets.begin()); NumTombstones = 0; IsSmall = false; } @@ -193,9 +191,9 @@ void SmallPtrSetImplBase::copyHelper(const SmallPtrSetImplBase &RHS) { CurArraySize = RHS.CurArraySize; // Copy over the contents from the other set - std::copy(RHS.CurArray, RHS.EndPointer(), CurArray); + llvm::copy(RHS.buckets(), CurArray); - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; } @@ -217,7 +215,7 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, if (RHS.isSmall()) { // Copy a small RHS rather than moving. CurArray = SmallStorage; - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, CurArray); + llvm::copy(RHS.small_buckets(), CurArray); } else { CurArray = RHS.CurArray; RHS.CurArray = RHSSmallStorage; @@ -225,13 +223,13 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, // Copy the rest of the trivial members. CurArraySize = RHS.CurArraySize; - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; IsSmall = RHS.IsSmall; // Make the RHS small and empty. RHS.CurArraySize = SmallSize; - RHS.NumNonEmpty = 0; + RHS.NumEntries = 0; RHS.NumTombstones = 0; RHS.IsSmall = true; } @@ -245,54 +243,42 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, if (!this->isSmall() && !RHS.isSmall()) { std::swap(this->CurArray, RHS.CurArray); std::swap(this->CurArraySize, RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); return; } // FIXME: From here on we assume that both sets have the same small size. - // If only RHS is small, copy the small elements into LHS and move the pointer - // from LHS to RHS. - if (!this->isSmall() && RHS.isSmall()) { - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, SmallStorage); - std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + // Both a small, just swap the small elements. 
+ if (this->isSmall() && RHS.isSmall()) { + unsigned MinEntries = std::min(this->NumEntries, RHS.NumEntries); + std::swap_ranges(this->CurArray, this->CurArray + MinEntries, RHS.CurArray); + if (this->NumEntries > MinEntries) { + std::copy(this->CurArray + MinEntries, this->CurArray + this->NumEntries, + RHS.CurArray + MinEntries); + } else { + std::copy(RHS.CurArray + MinEntries, RHS.CurArray + RHS.NumEntries, + this->CurArray + MinEntries); + } + assert(this->CurArraySize == RHS.CurArraySize); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); - RHS.CurArray = this->CurArray; - RHS.IsSmall = false; - this->CurArray = SmallStorage; - this->IsSmall = true; return; } - // If only LHS is small, copy the small elements into RHS and move the pointer - // from RHS to LHS. - if (this->isSmall() && !RHS.isSmall()) { - std::copy(this->CurArray, this->CurArray + this->NumNonEmpty, - RHSSmallStorage); - std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(RHS.NumNonEmpty, this->NumNonEmpty); - std::swap(RHS.NumTombstones, this->NumTombstones); - this->CurArray = RHS.CurArray; - this->IsSmall = false; - RHS.CurArray = RHSSmallStorage; - RHS.IsSmall = true; - return; - } - - // Both a small, just swap the small elements. - assert(this->isSmall() && RHS.isSmall()); - unsigned MinNonEmpty = std::min(this->NumNonEmpty, RHS.NumNonEmpty); - std::swap_ranges(this->CurArray, this->CurArray + MinNonEmpty, RHS.CurArray); - if (this->NumNonEmpty > MinNonEmpty) { - std::copy(this->CurArray + MinNonEmpty, this->CurArray + this->NumNonEmpty, - RHS.CurArray + MinNonEmpty); - } else { - std::copy(RHS.CurArray + MinNonEmpty, RHS.CurArray + RHS.NumNonEmpty, - this->CurArray + MinNonEmpty); - } - assert(this->CurArraySize == RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); - std::swap(this->NumTombstones, RHS.NumTombstones); + // If only one side is small, copy the small elements into the large side and + // move the pointer from the large side to the small side. + SmallPtrSetImplBase &SmallSide = this->isSmall() ? *this : RHS; + SmallPtrSetImplBase &LargeSide = this->isSmall() ? RHS : *this; + const void **LargeSideInlineStorage = + this->isSmall() ? 
RHSSmallStorage : SmallStorage; + llvm::copy(SmallSide.small_buckets(), LargeSideInlineStorage); + std::swap(LargeSide.CurArraySize, SmallSide.CurArraySize); + std::swap(LargeSide.NumEntries, SmallSide.NumEntries); + std::swap(LargeSide.NumTombstones, SmallSide.NumTombstones); + SmallSide.CurArray = LargeSide.CurArray; + SmallSide.IsSmall = false; + LargeSide.CurArray = LargeSideInlineStorage; + LargeSide.IsSmall = true; } diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index cc02cae..31fb1e8 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -876,6 +876,12 @@ void mapped_file_region::unmapImpl() { ::munmap(Mapping, Size); } +std::error_code mapped_file_region::sync() const { + if (int Res = ::msync(Mapping, Size, MS_SYNC)) + return std::error_code(Res, std::generic_category()); + return std::error_code(); +} + void mapped_file_region::dontNeedImpl() { assert(Mode == mapped_file_region::readonly); if (!Mapping) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index fdf9d54..9001c19 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1006,6 +1006,14 @@ void mapped_file_region::unmapImpl() { void mapped_file_region::dontNeedImpl() {} +std::error_code mapped_file_region::sync() const { + if (!::FlushViewOfFile(Mapping, Size)) + return mapWindowsError(GetLastError()); + if (!::FlushFileBuffers(FileHandle)) + return mapWindowsError(GetLastError()); + return std::error_code(); +} + int mapped_file_region::alignment() { SYSTEM_INFO SysInfo; ::GetSystemInfo(&SysInfo); diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 3f318e2..67622a9 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -3064,11 +3064,11 @@ const Init *Record::getValueInit(StringRef FieldName) const { } StringRef Record::getValueAsString(StringRef FieldName) const { - std::optional<StringRef> S = getValueAsOptionalString(FieldName); - if (!S) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - return *S; + const Init *I = getValueInit(FieldName); + if (const auto *SI = dyn_cast<StringInit>(I)) + return SI->getValue(); + PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + + "' exists but does not have a string value"); } std::optional<StringRef> @@ -3088,24 +3088,16 @@ Record::getValueAsOptionalString(StringRef FieldName) const { } const BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *BI = dyn_cast<BitsInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *BI = dyn_cast<BitsInit>(I)) return BI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but does not have a bits value"); } const ListInit *Record::getValueAsListInit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *LI = dyn_cast<ListInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *LI = dyn_cast<ListInit>(I)) return LI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but 
does not have a list value"); @@ -3127,17 +3119,13 @@ Record::getValueAsListOfDefs(StringRef FieldName) const { } int64_t Record::getValueAsInt(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *II = dyn_cast<IntInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *II = dyn_cast<IntInit>(I)) return II->getValue(); - PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" + - FieldName + - "' exists but does not have an int value: " + - R->getValue()->getAsString()); + PrintFatalError( + getLoc(), + Twine("Record `") + getName() + "', field `" + FieldName + + "' exists but does not have an int value: " + I->getAsString()); } std::vector<int64_t> @@ -3173,67 +3161,47 @@ Record::getValueAsListOfStrings(StringRef FieldName) const { } const Record *Record::getValueAsDef(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DefInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DefInit>(I)) return DI->getDef(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a def initializer!"); } const Record *Record::getValueAsOptionalDef(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DefInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DefInit>(I)) return DI->getDef(); - if (isa<UnsetInit>(R->getValue())) + if (isa<UnsetInit>(I)) return nullptr; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have either a def initializer or '?'!"); } bool Record::getValueAsBit(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName + "'!\n"); - - if (const auto *BI = dyn_cast<BitInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *BI = dyn_cast<BitInit>(I)) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); } bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field named `" + FieldName.str() + "'!\n"); - - if (isa<UnsetInit>(R->getValue())) { + const Init *I = getValueInit(FieldName); + if (isa<UnsetInit>(I)) { Unset = true; return false; } Unset = false; - if (const auto *BI = dyn_cast<BitInit>(R->getValue())) + if (const auto *BI = dyn_cast<BitInit>(I)) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); } const DagInit *Record::getValueAsDag(StringRef FieldName) const { - const RecordVal *R = getValue(FieldName); - if (!R || !R->getValue()) - PrintFatalError(getLoc(), "Record `" + getName() + - "' does not have a field 
named `" + FieldName + "'!\n"); - - if (const auto *DI = dyn_cast<DagInit>(R->getValue())) + const Init *I = getValueInit(FieldName); + if (const auto *DI = dyn_cast<DagInit>(I)) return DI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a dag initializer!"); diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 5496ebd..8d0ff41 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); FunctionPass *createSMEPeepholeOptPass(); +FunctionPass *createMachineSMEABIPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -111,6 +112,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void initializeSMEABIPass(PassRegistry &); void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeMachineSMEABIPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index ad8368e..1169f26 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -316,6 +316,12 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( ThunkArgTranslation::PointerIndirection}; }; + if (T->isHalfTy()) { + // Prefix with `llvm` since MSVC doesn't specify `_Float16` + Out << "__llvm_h__"; + return direct(T); + } + if (T->isFloatTy()) { Out << "f"; return direct(T); @@ -327,8 +333,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( } if (T->isFloatingPointTy()) { - report_fatal_error( - "Only 32 and 64 bit floating points are supported for ARM64EC thunks"); + report_fatal_error("Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } auto &DL = M->getDataLayout(); @@ -342,8 +348,16 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( uint64_t ElementCnt = T->getArrayNumElements(); uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8; uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes; - if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) { - Out << (ElementTy->isFloatTy() ? 
"F" : "D") << TotalSizeBytes; + if (ElementTy->isHalfTy() || ElementTy->isFloatTy() || + ElementTy->isDoubleTy()) { + if (ElementTy->isHalfTy()) + // Prefix with `llvm` since MSVC doesn't specify `_Float16` + Out << "__llvm_H__"; + else if (ElementTy->isFloatTy()) + Out << "F"; + else if (ElementTy->isDoubleTy()) + Out << "D"; + Out << TotalSizeBytes; if (Alignment.value() >= 16 && !Ret) Out << "a" << Alignment.value(); if (TotalSizeBytes <= 8) { @@ -355,8 +369,9 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( return pointerIndirection(T); } } else if (T->isFloatingPointTy()) { - report_fatal_error("Only 32 and 64 bit floating points are supported for " - "ARM64EC thunks"); + report_fatal_error( + "Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } } diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index 787a1a8..cc46159 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -75,8 +75,10 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, auto &It = PendingMembers[0]; CCAssignFn *AssignFn = TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false); + // FIXME: Get the correct original type. + Type *OrigTy = EVT(It.getValVT()).getTypeForEVT(State.getContext()); if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full, - ArgFlags, State)) + ArgFlags, OrigTy, State)) llvm_unreachable("Call operand has unhandled type"); // Return the flags to how they were before. diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 63185a9..7105fa6 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -18,52 +18,63 @@ namespace llvm { bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool 
CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State); } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 201bfe0..57dcd68 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,8 +92,9 @@ private: bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock *expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + MachineBasicBlock * + expandCommitOrRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -528,6 +529,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( UseRev = true; } break; + case AArch64::Destructive2xRegImmUnpred: + // EXT_ZZI_CONSTRUCTIVE Zd, Zs, Imm + // ==> MOVPRFX Zd Zs; EXT_ZZI Zd, Zd, Zs, Imm + std::tie(DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 1, 2); + break; default: llvm_unreachable("Unsupported Destructive Operand type"); } @@ -548,6 +554,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( break; case AArch64::DestructiveUnaryPassthru: case AArch64::DestructiveBinaryImm: + case AArch64::Destructive2xRegImmUnpred: DOPRegIsUnique = true; break; case AArch64::DestructiveTernaryCommWithRev: @@ -674,6 +681,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; + case AArch64::Destructive2xRegImmUnpred: + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)) + .add(MI.getOperand(Src2Idx)); + break; } if (PRFX) { @@ -979,10 +991,15 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -MachineBasicBlock * -AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { +static constexpr 
unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; + bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; + assert((MI.getOpcode() == AArch64::RestoreZAPseudo || + MI.getOpcode() == AArch64::CommitZASavePseudo) && + "Expected ZA commit or restore"); assert((std::next(MBBI) != MBB.end() || MI.getParent()->successors().begin() != MI.getParent()->successors().end()) && @@ -990,21 +1007,23 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Compare TPIDR2_EL0 value against 0. DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Cbz = BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)) - .add(MI.getOperand(0)); + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, + TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) + .add(MI.getOperand(0)); // Split MBB and create two new blocks: // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the RestoreZAPseudo instruction only. - // - EndBB contains all instructions after RestoreZAPseudo. + // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. + // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. MachineInstr &PrevMI = *std::prev(MBBI); MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() ? *SMBB->successors().begin() : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Cbz.addMBB(SMBB); + // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. + Branch.addMBB(SMBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); @@ -1012,11 +1031,30 @@ AArch64ExpandPseudo::expandRestoreZA(MachineBasicBlock &MBB, // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + if (IsRestoreZA) { + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); + } else /*CommitZA*/ { + [[maybe_unused]] auto *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + // Clear TPIDR2_EL0. + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); + } + } + + BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); return EndBB; } @@ -1236,14 +1274,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, .add(MI.getOperand(3)); transferImpOps(MI, I, I); } else { + unsigned RegState = + getRenamableRegState(MI.getOperand(1).isRenamable()) | + getKillRegState( + MI.getOperand(1).isKill() && + MI.getOperand(1).getReg() != MI.getOperand(2).getReg() && + MI.getOperand(1).getReg() != MI.getOperand(3).getReg()); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::ORRv8i8 : AArch64::ORRv16i8)) .addReg(DstReg, RegState::Define | getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)); + .addReg(MI.getOperand(1).getReg(), RegState) + .addReg(MI.getOperand(1).getReg(), RegState); auto I2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 @@ -1629,8 +1673,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::CommitZASavePseudo: case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandRestoreZA(MBB, MBBI); + auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; @@ -1641,6 +1686,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. return true; } + case AArch64::InOutZAUsePseudo: + case AArch64::RequiresZASavePseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: case AArch64::COALESCER_BARRIER_FPR64: diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9d74bb5..cf34498 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -267,7 +267,7 @@ private: private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs, - unsigned &NumBytes); + SmallVectorImpl<Type *> &OrigTys, unsigned &NumBytes); bool finishCall(CallLoweringInfo &CLI, unsigned NumBytes); public: @@ -3011,11 +3011,13 @@ bool AArch64FastISel::fastLowerArguments() { bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, + SmallVectorImpl<Type *> &OrigTys, unsigned &NumBytes) { CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, OrigTys, + CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getStackSize(); @@ -3194,6 +3196,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Set up the argument vectors. SmallVector<MVT, 16> OutVTs; + SmallVector<Type *, 16> OrigTys; OutVTs.reserve(CLI.OutVals.size()); for (auto *Val : CLI.OutVals) { @@ -3207,6 +3210,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; OutVTs.push_back(VT); + OrigTys.push_back(Val->getType()); } Address Addr; @@ -3222,7 +3226,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Handle the arguments now that we've gotten them. unsigned NumBytes; - if (!processCallArgs(CLI, OutVTs, NumBytes)) + if (!processCallArgs(CLI, OutVTs, OrigTys, NumBytes)) return false; const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); @@ -3574,12 +3578,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { Args.reserve(II->arg_size()); // Populate the argument list. 
- for (auto &Arg : II->args()) { - ArgListEntry Entry; - Entry.Val = Arg; - Entry.Ty = Arg->getType(); - Args.push_back(Entry); - } + for (auto &Arg : II->args()) + Args.emplace_back(Arg); CallLoweringInfo CLI; MCContext &Ctx = MF->getContext(); @@ -4870,12 +4870,8 @@ bool AArch64FastISel::selectFRem(const Instruction *I) { Args.reserve(I->getNumOperands()); // Populate the argument list. - for (auto &Arg : I->operands()) { - ArgListEntry Entry; - Entry.Val = Arg; - Entry.Ty = Arg->getType(); - Args.push_back(Entry); - } + for (auto &Arg : I->operands()) + Args.emplace_back(Arg); CallLoweringInfo CLI; MCContext &Ctx = MF->getContext(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 885f2a9..fddde66 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1475,24 +1475,26 @@ static bool requiresSaveVG(const MachineFunction &MF) { return true; } -bool isVGInstruction(MachineBasicBlock::iterator MBBI) { +static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO, + RTLIB::Libcall LC) { + return MO.isSymbol() && + StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName(); +} + +bool isVGInstruction(MachineBasicBlock::iterator MBBI, + const TargetLowering &TLI) { unsigned Opc = MBBI->getOpcode(); if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || Opc == AArch64::UBFMXri) return true; - if (requiresGetVGCall(*MBBI->getMF())) { - if (Opc == AArch64::ORRXrr) - return true; + if (!requiresGetVGCall(*MBBI->getMF())) + return false; - if (Opc == AArch64::BL) { - auto Op1 = MBBI->getOperand(0); - return Op1.isSymbol() && - (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); - } - } + if (Opc == AArch64::BL) + return matchLibcall(TLI, MBBI->getOperand(0), RTLIB::SMEABI_GET_CURRENT_VG); - return false; + return Opc == AArch64::ORRXrr; } // Convert callee-save register save/restore instruction to do stack pointer @@ -1511,9 +1513,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // functions, we need to do this for both the streaming and non-streaming // vector length. Move past these instructions if necessary. MachineFunction &MF = *MBB.getParent(); - if (requiresSaveVG(MF)) - while (isVGInstruction(MBBI)) + if (requiresSaveVG(MF)) { + auto &TLI = *MF.getSubtarget().getTargetLowering(); + while (isVGInstruction(MBBI, TLI)) ++MBBI; + } switch (MBBI->getOpcode()) { default: @@ -2097,11 +2101,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Move past the saves of the callee-saved registers, fixing up the offsets // and pre-inc if we decided to combine the callee-save and local stack // pointer bump above. + auto &TLI = *MF.getSubtarget().getTargetLowering(); while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. 
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI))) + (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI))) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); ++MBBI; @@ -3468,6 +3473,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); + auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool NeedsWinCFI = needsWinCFI(MF); @@ -3581,11 +3587,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( .addReg(AArch64::X0, RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - const uint32_t *RegMask = TRI->getCallPreservedMask( - MF, - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG; + const uint32_t *RegMask = + TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC)); BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) - .addExternalSymbol("__arm_get_current_vg") + .addExternalSymbol(TLI.getLibcallName(LC)) .addRegMask(RegMask) .addReg(AArch64::X0, RegState::ImplicitDefine) .setMIFlag(MachineInstr::FrameSetup); diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index ad42f4b..bc786f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -7617,16 +7617,29 @@ bool AArch64DAGToDAGISel::SelectAnyPredicate(SDValue N) { bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize, SDValue &Base, SDValue &Offset, unsigned Scale) { - // Try to untangle an ADD node into a 'reg + offset' - if (CurDAG->isBaseWithConstantOffset(N)) - if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + auto MatchConstantOffset = [&](SDValue CN) -> SDValue { + if (auto *C = dyn_cast<ConstantSDNode>(CN)) { int64_t ImmOff = C->getSExtValue(); - if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) { - Base = N.getOperand(0); - Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64); - return true; - } + if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) + return CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64); } + return SDValue(); + }; + + if (SDValue C = MatchConstantOffset(N)) { + Base = CurDAG->getConstant(0, SDLoc(N), MVT::i32); + Offset = C; + return true; + } + + // Try to untangle an ADD node into a 'reg + offset' + if (CurDAG->isBaseWithConstantOffset(N)) { + if (SDValue C = MatchConstantOffset(N.getOperand(1))) { + Base = N.getOperand(0); + Offset = C; + return true; + } + } // By default, just match reg + 0. 
Base = N; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3c06c6a..e896370 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17,6 +17,7 @@ #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" @@ -1998,6 +1999,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); } +const AArch64TargetMachine &AArch64TargetLowering::getTM() const { + return static_cast<const AArch64TargetMachine &>(getTargetMachine()); +} + void AArch64TargetLowering::addTypeForNEON(MVT VT) { assert(VT.isVector() && "VT should be a vector type"); @@ -3083,13 +3088,12 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); if (FuncInfo->isSMESaveBufferUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - .addExternalSymbol("__arm_sme_state_size") + .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask( - *MF, CallingConv:: - AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(AArch64::X0); @@ -3101,6 +3105,30 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + Register ResultReg = MI.getOperand(0).getReg(); + if (FuncInfo->isPStateSMRegUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) + .addExternalSymbol(getLibcallName(LC)) + .addReg(AArch64::X0, RegState::ImplicitDefine) + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) + .addReg(AArch64::X0); + } else { + assert(MI.getMF()->getRegInfo().use_empty(ResultReg) && + "Expected no users of the entry pstate.sm!"); + } + MI.eraseFromParent(); + return BB; +} + // Helper function to find the instruction that defined a virtual register. // If unable to find such instruction, returns nullptr. 
static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI, @@ -3216,6 +3244,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitAllocateSMESaveBuffer(MI, BB); case AArch64::GetSMESaveSize: return EmitGetSMESaveSize(MI, BB); + case AArch64::EntryPStateSM: + return EmitEntryPStateSM(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: @@ -3320,7 +3350,8 @@ static bool isZerosVector(const SDNode *N) { /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC -static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, + SDValue RHS = {}) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); @@ -3331,9 +3362,9 @@ static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: - return AArch64CC::GE; + return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE; case ISD::SETLT: - return AArch64CC::LT; + return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: @@ -3492,6 +3523,13 @@ bool isLegalCmpImmed(APInt C) { return isLegalArithImmed(C.abs().getZExtValue()); } +unsigned numberOfInstrToLoadImm(APInt C) { + uint64_t Imm = C.getZExtValue(); + SmallVector<AArch64_IMM::ImmInsnModel> Insn; + AArch64_IMM::expandMOVImm(Imm, 32, Insn); + return Insn.size(); +} + static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) { // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe. if (Op->getFlags().hasNoSignedWrap()) @@ -3782,7 +3820,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { - OutCC = changeIntCCToAArch64CC(CC); + OutCC = changeIntCCToAArch64CC(CC, RHS); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; @@ -3961,6 +3999,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // CC has already been adjusted. RHS = DAG.getConstant(0, DL, VT); } else if (!isLegalCmpImmed(C)) { + unsigned NumImmForC = numberOfInstrToLoadImm(C); // Constant does not fit, try adjusting it by one? switch (CC) { default: @@ -3969,43 +4008,49 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, case ISD::SETGE: if (!C.isMinSignedValue()) { APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(CMinusOne, DL, VT); } } break; case ISD::SETULT: - case ISD::SETUGE: - if (!C.isZero()) { - APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(CMinusOne, DL, VT); - } + case ISD::SETUGE: { + // C is not 0 because it is a legal immediate. + assert(!C.isZero() && "C should not be zero here"); + APInt CMinusOne = C - 1; + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { + CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(CMinusOne, DL, VT); } break; + } case ISD::SETLE: case ISD::SETGT: if (!C.isMaxSignedValue()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; case ISD::SETULE: - case ISD::SETUGT: + case ISD::SETUGT: { if (!C.isAllOnes()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; } + } } } @@ -4079,7 +4124,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, DL, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); + AArch64CC = changeIntCCToAArch64CC(CC, RHS); } AArch64cc = getCondCode(DAG, AArch64CC); return Cmp; @@ -5174,13 +5219,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; @@ -5711,15 +5750,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, SDValue Chain, SDLoc DL, EVT VT) const { - SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); Type *RetTy = StructType::get(Int64Ty, Int64Ty); TargetLowering::CallLoweringInfo CLI(DAG); ArgListTy Args; CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, - RetTy, Callee, std::move(Args)); + getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), @@ -7886,8 +7925,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, + Ins[i].OrigTy, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } @@ -8132,19 +8171,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); + if (Attrs.hasStreamingCompatibleInterface()) { + SDValue EntryPStateSM = + DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain}); + + // Copy the value to a virtual register, and save that in FuncInfo. + Register EntryPStateSMReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg, + EntryPStateSM); + FuncInfo->setPStateSMReg(EntryPStateSMReg); + } + // Insert the SMSTART if this is a locally streaming function and // make sure it is Glued to the last CopyFromReg value. 
if (IsLocallyStreaming) { - SDValue PStateSM; - if (Attrs.hasStreamingCompatibleInterface()) { - PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); - Register Reg = MF.getRegInfo().createVirtualRegister( - getRegClassFor(PStateSM.getValueType().getSimpleVT())); - FuncInfo->setPStateSMReg(Reg); - Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM); + if (Attrs.hasStreamingCompatibleInterface()) Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, - AArch64SME::IfCallerIsNonStreaming, PStateSM); - } else + AArch64SME::IfCallerIsNonStreaming); + else Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, AArch64SME::Always); @@ -8244,53 +8290,54 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Create a 16 Byte TPIDR2 object. The dynamic buffer - // will be expanded and stored in the static object later using a pseudonode. - if (Attrs.hasZAState()) { - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); - SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); - } else { - SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); - Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, - DAG.getVTList(MVT::i64, MVT::Other), - {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); - } - Chain = DAG.getNode( - AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); - } else if (Attrs.hasAgnosticZAInterface()) { - // Call __arm_sme_state_size(). - SDValue BufferSize = - DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, - DAG.getVTList(MVT::i64, MVT::Other), Chain); - Chain = BufferSize.getValue(1); - - SDValue Buffer; - if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = - DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize}); - } else { - // Allocate space dynamically. - Buffer = DAG.getNode( - ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), - {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(16), nullptr); + if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) { + // Old SME ABI lowering (deprecated): + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a + // pseudonode. 
+ if (Attrs.hasZAState()) { + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); + } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + } else if (Attrs.hasAgnosticZAInterface()) { + // Call __arm_sme_state_size(). + SDValue BufferSize = + DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL, + DAG.getVTList(MVT::i64, MVT::Other), Chain); + Chain = BufferSize.getValue(1); + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize}); + } else { + // Allocate space dynamically. + Buffer = DAG.getNode( + ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), + {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + // Copy the value to a virtual register, and save that in FuncInfo. + Register BufferPtr = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + FuncInfo->setSMESaveBufferAddr(BufferPtr); + Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } - - // Copy the value to a virtual register, and save that in FuncInfo. - Register BufferPtr = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - FuncInfo->setSMESaveBufferAddr(BufferPtr); - Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); } if (CallConv == CallingConv::PreserveNone) { @@ -8307,6 +8354,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } } + if (getTM().useNewSMEABILowering()) { + // Clear new ZT0 state. TODO: Move this to the SME ABI pass. + if (Attrs.isNewZT0()) + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return Chain; } @@ -8557,19 +8613,20 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, // FIXME: CCAssignFnForCall should be called once, for the call and not per // argument. This logic should exactly mirror LowerFormalArguments. 
CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, + Outs[i].OrigTy, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } static SMECallAttrs -getSMECallAttrs(const Function &Caller, +getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI) { if (CLI.CB) - return SMECallAttrs(*CLI.CB); + return SMECallAttrs(*CLI.CB, &TLI); if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) - return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol())); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); } @@ -8591,7 +8648,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || CallAttrs.caller().hasStreamingBody()) @@ -8834,8 +8891,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, - unsigned Condition, - SDValue PStateSM) const { + unsigned Condition) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); FuncInfo->setHasStreamingModeChanges(true); @@ -8847,9 +8903,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, SmallVector<SDValue> Ops = {Chain, MSROp}; unsigned Opcode; if (Condition != AArch64SME::Always) { + FuncInfo->setPStateSMRegUsed(true); + Register PStateReg = FuncInfo->getPStateSMReg(); + assert(PStateReg.isValid() && "PStateSM Register is invalid"); + SDValue PStateSM = + DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue); + // Use chain and glue from the CopyFromReg. + Ops[0] = PStateSM.getValue(1); + InGlue = PStateSM.getValue(2); SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64); Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP; - assert(PStateSM && "PStateSM should be defined"); Ops.push_back(ConditionOp); Ops.push_back(PStateSM); } else { @@ -8871,22 +8934,19 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64); - Args.push_back(Entry); - - SDValue Callee = - DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore", - TLI.getPointerTy(DAG.getDataLayout())); + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? 
RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); auto *RetTy = Type::getVoidTy(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy, - Callee, std::move(Args)); + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); return TLI.LowerCallTo(CLI).second; } @@ -9014,14 +9074,28 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CallConv = CallingConv::AArch64_SVE_VectorCall; } + // Determine whether we need any streaming mode changes. + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); + bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface(); + auto ZAMarkerNode = [&]() -> std::optional<unsigned> { + // TODO: Handle agnostic ZA functions. + if (!UseNewSMEABILowering || IsAgnosticZAFunction) + return std::nullopt; + if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State()) + return std::nullopt; + return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE + : AArch64ISD::INOUT_ZA_USE; + }(); + if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && - CallConv != CallingConv::SwiftTail) + if (!ZAMarkerNode && !TailCallOpt && IsTailCall && + CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) IsSibCall = true; if (IsTailCall) @@ -9073,9 +9147,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } - // Determine whether we need any streaming mode changes. - SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI); - auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '"; @@ -9089,7 +9160,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return R; }; - bool RequiresLazySave = CallAttrs.requiresLazySave(); + bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); @@ -9124,15 +9195,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, /*IsSave=*/true); } - SDValue PStateSM; bool RequiresSMChange = CallAttrs.requiresSMChange(); if (RequiresSMChange) { - if (CallAttrs.caller().hasStreamingInterfaceOrBody()) - PStateSM = DAG.getConstant(1, DL, MVT::i64); - else if (CallAttrs.caller().hasNonStreamingInterface()) - PStateSM = DAG.getConstant(0, DL, MVT::i64); - else - PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition", @@ -9171,10 +9235,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32)); - // Adjust the stack pointer for the new arguments... + // Adjust the stack pointer for the new arguments... 
and mark ZA uses. // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) + assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START"); + if (!IsSibCall) { Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); + if (ZAMarkerNode) { + // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply + // using a chain can result in incorrect scheduling. The markers refer to + // the position just before the CALLSEQ_START (though occur after as + // CALLSEQ_START lacks in-glue). + Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other), + {Chain, Chain.getValue(1)}); + } + } SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -9447,9 +9521,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, InGlue = Chain.getValue(1); } - SDValue NewChain = changeStreamingMode( - DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue, - getSMToggleCondition(CallAttrs), PStateSM); + SDValue NewChain = + changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(), + Chain, InGlue, getSMToggleCondition(CallAttrs)); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } @@ -9633,10 +9707,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, InGlue = Result.getValue(Result->getNumValues() - 1); if (RequiresSMChange) { - assert(PStateSM && "Expected a PStateSM to be set"); Result = changeStreamingMode( DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue, - getSMToggleCondition(CallAttrs), PStateSM); + getSMToggleCondition(CallAttrs)); if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) { InGlue = Result.getValue(1); @@ -9646,7 +9719,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (CallAttrs.requiresEnablingZAAfterCall()) + if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, @@ -9659,15 +9732,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + getLibcallName(LC), getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -9679,7 +9752,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. 
Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, @@ -9802,14 +9874,11 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Emit SMSTOP before returning from a locally streaming function SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs(); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { - if (FuncAttrs.hasStreamingCompatibleInterface()) { - Register Reg = FuncInfo->getPStateSMReg(); - assert(Reg.isValid() && "PStateSM Register is invalid"); - SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64); + if (FuncAttrs.hasStreamingCompatibleInterface()) Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), - AArch64SME::IfCallerIsNonStreaming, PStateSM); - } else + AArch64SME::IfCallerIsNonStreaming); + else Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), AArch64SME::Always); Glue = Chain.getValue(1); @@ -17359,7 +17428,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -17369,7 +17438,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); const DataLayout &DL = LI->getDataLayout(); @@ -28194,6 +28263,7 @@ void AArch64TargetLowering::ReplaceNodeResults( case Intrinsic::aarch64_sme_in_streaming_mode: { SDLoc DL(N); SDValue Chain = DAG.getEntryNode(); + SDValue RuntimePStateSM = getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0)); Results.push_back( @@ -29004,7 +29074,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast<CallBase>(&Inst)) { - auto CallAttrs = SMECallAttrs(*Base); + auto CallAttrs = SMECallAttrs(*Base, this); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 8887657..071e96e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,6 +23,8 @@ namespace llvm { +class AArch64TargetMachine; + namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits /// 23:22 of FPCR. @@ -64,6 +66,8 @@ public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); + const AArch64TargetMachine &getTM() const; + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op /// (op x, y), c1) where N0 is (op x, c1) and N1 is y. 
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, @@ -173,6 +177,10 @@ public: MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + + // Note: The following group of functions are only used as part of the old SME + // ABI lowering. They will be removed once -aarch64-new-sme-abi=true is the + // default. MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, @@ -181,6 +189,8 @@ public: MachineBasicBlock *BB) const; MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitEntryPStateSM(MachineInstr &MI, + MachineBasicBlock *BB) const; /// Replace (0, vreg) discriminator components with the operands of blend /// or with (immediate, NoRegister) when possible. @@ -220,8 +230,8 @@ public: bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; @@ -523,8 +533,8 @@ public: /// node. \p Condition should be one of the enum values from /// AArch64SME::ToggleCondition. SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, - SDValue Chain, SDValue InGlue, unsigned Condition, - SDValue PStateSM = SDValue()) const; + SDValue Chain, SDValue InGlue, + unsigned Condition) const; bool isVScaleKnownToBeAPowerOfTwo() const override { return true; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d068a12..178dab6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -36,7 +36,12 @@ def DestructiveBinary : DestructiveInstTypeEnum<5>; def DestructiveBinaryComm : DestructiveInstTypeEnum<6>; def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; -def DestructiveUnaryPassthru : DestructiveInstTypeEnum<9>; + +// 3 inputs unpredicated (reg1, reg2, imm). +// Can be MOVPRFX'd iff reg1 == reg2. 
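// For example, SVE EXT in its destructive form has this shape
// (EXT Zdn.B, Zdn.B, Zm.B, #imm). A splice of a register with itself,
// ext(z1, z1, #3), can then plausibly be expanded constructively as
//   movprfx z0, z1
//   ext     z0.b, z0.b, z1.b, #3
// leaving z1 unmodified (an illustrative expansion; the corresponding pseudo
// is the EXT_ZZI_CONSTRUCTIVE definition added to AArch64SVEInstrInfo.td
// later in this patch).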
+def Destructive2xRegImmUnpred : DestructiveInstTypeEnum<9>; + +def DestructiveUnaryPassthru : DestructiveInstTypeEnum<10>; class FalseLanesEnum<bits<2> val> { bits<2> Value = val; @@ -7362,7 +7367,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc, V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", []>; + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>; let Predicates = [HasAES] in { def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc, V128, V64, V64, @@ -7374,10 +7381,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)), (extract_high_v2i64 (v2i64 V128:$Rm))))]>; } - - def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), - (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), - (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, @@ -7402,87 +7405,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } -multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, - V128, V64, V64, - asm, ".8h", ".8b", ".8b", - [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, - V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", - [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), - (extract_high_v16i8 (v16i8 V128:$Rm))))))]>; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, - V128, V64, V64, - asm, ".4s", ".4h", ".4h", - [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, - V128, V128, V128, - asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), - (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, - V128, V64, V64, - asm, ".2d", ".2s", ".2s", - [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, - V128, V128, V128, - asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), - (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; -} - -multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc, - V128, V64, V64, - asm, ".8h", ".8b", ".8b", - [(set (v8i16 V128:$dst), - (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, - V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", - [(set (v8i16 V128:$dst), - (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), - (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, - V128, V64, V64, - asm, ".4s", ".4h", ".4h", - [(set (v4i32 V128:$dst), - (add (v4i32 
V128:$Rd), - (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, - V128, V128, V128, - asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$dst), - (add (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), - (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, - V128, V64, V64, - asm, ".2d", ".2s", ".2s", - [(set (v2i64 V128:$dst), - (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, - V128, V128, V128, - asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$dst), - (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), - (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; -} - +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index fb59c9f..d15f90d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,7 +20,9 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -83,6 +85,11 @@ static cl::opt<unsigned> BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)")); +static cl::opt<unsigned> GatherOptSearchLimit( + "aarch64-search-limit", cl::Hidden, cl::init(2048), + cl::desc("Restrict range of instructions to search for the " + "machine-combiner gather pattern optimization")); + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), @@ -5078,8 +5085,13 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. MCRegister DestRegX = TRI->getMatchingSuperReg( DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); - MCRegister SrcRegX = TRI->getMatchingSuperReg( - SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); + assert(DestRegX.isValid() && "Destination super-reg not valid"); + MCRegister SrcRegX = + SrcReg == AArch64::WZR + ? AArch64::XZR + : TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + assert(SrcRegX.isValid() && "Source super-reg not valid"); // This instruction is reading and writing X registers. 
This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper @@ -5920,7 +5932,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes) SmallString<64> Expr; unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); - assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); + assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); // Reg + NumBytes Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg); appendLEB128<LEB128Sign::Signed>(Expr, NumBytes); @@ -7412,11 +7424,319 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +/// Check if the given instruction forms a gather load pattern that can be +/// optimized for better Memory-Level Parallelism (MLP). This function +/// identifies chains of NEON lane load instructions that load data from +/// different memory addresses into individual lanes of a 128-bit vector +/// register, then attempts to split the pattern into parallel loads to break +/// the serial dependency between instructions. +/// +/// Pattern Matched: +/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) -> +/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root) +/// +/// Transformed Into: +/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64 +/// to combine the results, enabling better memory-level parallelism. +/// +/// Supported Element Types: +/// - 32-bit elements (LD1i32, 4 lanes total) +/// - 16-bit elements (LD1i16, 8 lanes total) +/// - 8-bit elements (LD1i8, 16 lanes total) +static bool getGatherLanePattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single pointer operand + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end()); + SmallVector<const MachineInstr *, 16> LoadInstrs; + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + LoadInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Check that we have found a match for lanes N-1.. 1. + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. 
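// Illustrative sketch (registers, pointers and scalar-load opcode are
// schematic) of the 4 x 32-bit case this matcher targets and of the rewrite
// performed by generateGatherLanePattern further below:
//
//   Before: one serial chain of lane inserts
//     %s0:fpr32  = LDRSui %p0, 0
//     %v0:fpr128 = SUBREG_TO_REG 0, %s0, %subreg.ssub
//     %v1:fpr128 = LD1i32 %v0, 1, %p1
//     %v2:fpr128 = LD1i32 %v1, 2, %p2
//     %v3:fpr128 = LD1i32 %v2, 3, %p3        (Root, last lane)
//
//   After: two independent half-chains combined with ZIP1
//     %a1:fpr128 = LD1i32 %v0, 1, %p1        (reuses the lane-0 SUBREG_TO_REG)
//     %s1:fpr32  = LDRSui %p2, 0
//     %b0:fpr128 = SUBREG_TO_REG 0, %s1, %subreg.ssub
//     %b1:fpr128 = LD1i32 %b0, 1, %p3
//     %v3:fpr128 = ZIP1v2i64 %a1, %b1
//
// The two load chains have no dependency on each other, which is the
// memory-level parallelism the combiner is after.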
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg)); + + // If there is any chance of aliasing, do not apply the pattern. + // Walk backward through the MBB starting from Root. + // Exit early if we've encountered all load instructions or hit the search + // limit. + auto MBBItr = Root.getIterator(); + unsigned RemainingSteps = GatherOptSearchLimit; + SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs; + RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end()); + const MachineBasicBlock *MBB = Root.getParent(); + + for (; MBBItr != MBB->begin() && RemainingSteps > 0 && + !RemainingLoadInstrs.empty(); + --MBBItr, --RemainingSteps) { + const MachineInstr &CurrInstr = *MBBItr; + + // Remove this instruction from remaining loads if it's one we're tracking. + RemainingLoadInstrs.erase(&CurrInstr); + + // Check for potential aliasing with any of the load instructions to + // optimize. + if (CurrInstr.isLoadFoldBarrier()) + return false; + } + + // If we hit the search limit without finding all load instructions, + // don't match the pattern. + if (RemainingSteps == 0 && !RemainingLoadInstrs.empty()) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns of LD instructions we can optimize. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +/// Generate optimized instruction sequence for gather load patterns to improve +/// Memory-Level Parallelism (MLP). This function transforms a chain of +/// sequential NEON lane loads into parallel vector loads that can execute +/// concurrently. +static void +generateGatherLanePattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern. + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + // Helper lambda to create a LD1 instruction. + auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister, + bool OffsetRegisterKillState) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on the NumLanes in the NEON + // register we are rewriting. + auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg, + bool KillState) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + Register PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg0 = PrevReg; + + // First load into register 1. Perform an integer load to zero out the upper + // lanes in a single instruction. + MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin(); + MachineInstr *OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + Register DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + const MachineOperand &OriginalSplitToLoadOffsetOperand = + OriginalSplitLoad->getOperand(3); + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLDRInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitToLoadOffsetOperand.getReg(), + OriginalSplitToLoadOffsetOperand.isKill()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + + // Do not add the last reg to DelInstrs - it will be removed later. + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7446,6 +7766,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8701,6 +9025,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da..70c814a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public 
AArch64GenInstrInfo { const AArch64RegisterInfo RI; @@ -820,7 +824,8 @@ enum DestructiveInstType { DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6), DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7), DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8), - DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9), + Destructive2xRegImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x9), + DestructiveUnaryPassthru = TSFLAG_DESTRUCTIVE_INST_TYPE(0xa), }; enum FalseLaneType { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ac31236..4fa91a4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5707,27 +5707,6 @@ let Predicates = [HasFullFP16] in { // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>; -// Match UABDL in log2-shuffle patterns. -def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), - (zext (v4i16 V64:$opB))))), - (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))), - (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), - (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), - (zext (v2i32 V64:$opB))))), - (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), - (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), - (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; - defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; @@ -6055,6 +6034,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +let isCommutable = 1 in defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; @@ -6802,40 +6782,47 @@ def : Pat <(f64 (uint_to_fp (i32 // Advanced SIMD three different-sized vector instructions. 
//===----------------------------------------------------------------------===// -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +let isCommutable = 1 in +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHS<0,0b0101,"sabal", + TriOpFrag<(add node:$LHS, (zext (abds node:$MHS, node:$RHS)))>>; +defm SABDL : SIMDLongThreeVectorBHS<0, 0b0111, "sabdl", + BinOpFrag<(zext (abds node:$LHS, node:$RHS))>>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_aarch64_neon_sqdmull>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>; +let isCommutable = 0 in defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>; +defm UABAL : SIMDLongThreeVectorTiedBHS<1, 0b0101, "uabal", + TriOpFrag<(add node:$LHS, (zext (abdu node:$MHS, node:$RHS)))>>; +defm UABDL : SIMDLongThreeVectorBHS<1, 0b0111, "uabdl", + BinOpFrag<(zext (abdu node:$LHS, node:$RHS))>>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMULL : 
SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; +let isCommutable = 0 in defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 782d62a7..e69fa32 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1193,7 +1193,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // USE kill %w1 ; need to clear kill flag when moving STRWui downwards // STRW %w0 Register Reg = getLdStRegOp(*I).getReg(); - for (MachineInstr &MI : make_range(std::next(I), Paired)) + for (MachineInstr &MI : + make_range(std::next(I->getIterator()), Paired->getIterator())) MI.clearRegisterKills(Reg, TRI); } } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 800787c..ed3374a 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -213,9 +213,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// or return type bool IsSVECC = false; - /// The frame-index for the TPIDR2 object used for lazy saves. - TPIDR2Object TPIDR2; - /// Whether this function changes streaming mode within the function. bool HasStreamingModeChanges = false; @@ -231,13 +228,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // on function entry to record the initial pstate of a function. Register PStateSMReg = MCRegister::NoRegister; - // Holds a pointer to a buffer that is large enough to represent - // all SME ZA state and any additional state required by the - // __arm_sme_save/restore support routines. - Register SMESaveBufferAddr = MCRegister::NoRegister; - - // true if SMESaveBufferAddr is used. - bool SMESaveBufferUsed = false; + // true if PStateSMReg is used. + bool PStateSMRegUsed = false; // Has the PNReg used to build PTRUE instruction. // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. @@ -250,6 +242,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // Holds the SME function attributes (streaming mode, ZA/ZT0 state). SMEAttrs SMEFnAttrs; + // Note: The following properties are only used for the old SME ABI lowering: + /// The frame-index for the TPIDR2 object used for lazy saves. + TPIDR2Object TPIDR2; + // Holds a pointer to a buffer that is large enough to represent + // all SME ZA state and any additional state required by the + // __arm_sme_save/restore support routines. + Register SMESaveBufferAddr = MCRegister::NoRegister; + // true if SMESaveBufferAddr is used. 
+ bool SMESaveBufferUsed = false; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -258,6 +260,13 @@ public: const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const override; + // Old SME ABI lowering state getters/setters: + Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; + void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; + unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; + void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } + void setPredicateRegForFillSpill(unsigned Reg) { PredicateRegForFillSpill = Reg; } @@ -265,15 +274,12 @@ public: return PredicateRegForFillSpill; } - Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; - void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; - - unsigned isSMESaveBufferUsed() const { return SMESaveBufferUsed; }; - void setSMESaveBufferUsed(bool Used = true) { SMESaveBufferUsed = Used; }; - Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; }; + void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; }; + int64_t getVGIdx() const { return VGIdx; }; void setVGIdx(unsigned Idx) { VGIdx = Idx; }; @@ -283,8 +289,6 @@ public: bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } - void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 1bc1d98..42eaeca 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -321,7 +321,6 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing, FeatureZCZeroingFPWorkaround]>; @@ -335,7 +334,6 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", @@ -348,7 +346,6 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", @@ -361,7 +358,6 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", @@ -374,7 +370,6 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", @@ -392,7 +387,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - 
FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", @@ -410,7 +404,6 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", @@ -428,7 +421,6 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", @@ -446,7 +438,6 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", @@ -463,7 +454,6 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureZCRegMoveGPR64, - FeatureZCRegMoveFPR64, FeatureZCZeroing ]>; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index db27ca9..5c4e0c1 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -39,12 +39,25 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, def AArch64CoalescerBarrier : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; +def AArch64EntryPStateSM + : SDNode<"AArch64ISD::ENTRY_PSTATE_SM", SDTypeProfile<1, 0, + [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>; + +let usesCustomInserter = 1 in { + def EntryPStateSM : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {} +} +def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>; + def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; +//===----------------------------------------------------------------------===// +// Old SME ABI lowering ISD nodes/pseudos (deprecated) +//===----------------------------------------------------------------------===// + def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPSideEffect]>; @@ -78,6 +91,30 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), (AllocateSMESaveBuffer $size)>; //===----------------------------------------------------------------------===// +// New SME ABI lowering ISD nodes/pseudos (-aarch64-new-sme-abi) +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isMeta = 1 in { + def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +} + +def CommitZASavePseudo + : Pseudo<(outs), + (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, + Sched<[]>; + +def AArch64_inout_za_use + : SDNode<"AArch64ISD::INOUT_ZA_USE", SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_inout_za_use), (InOutZAUsePseudo)>; + +def AArch64_requires_za_save + : SDNode<"AArch64ISD::REQUIRES_ZA_SAVE", 
SDTypeProfile<0, 0,[]>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; + +//===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 0c4b4f4..509dd8b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1021,7 +1021,9 @@ let Predicates = [HasNonStreamingSVE_or_SME2p2] in { let Predicates = [HasSVE_or_SME] in { defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; - defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; + defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext, "EXT_ZZI_CONSTRUCTIVE">; + + def EXT_ZZI_CONSTRUCTIVE : UnpredRegImmPseudo<ZPR8, imm0_255>; defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; @@ -2131,21 +2133,37 @@ let Predicates = [HasSVE_or_SME] in { (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>; // Splice with lane bigger or equal to 0 - foreach VT = [nxv16i8] in + foreach VT = [nxv16i8] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in + foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in + foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in + foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td index 89ed1338..5ec95c7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA320.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td @@ -847,7 +847,7 @@ def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]XTB_ZPmZ "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, 
EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 9456878..356e3fa 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -825,7 +825,7 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]XTB_ZPmZ "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", @@ -1016,7 +1016,7 @@ def : InstRW<[CortexA510MCWrite<16, 13, CortexA510UnitVALU>], (instrs FADDA_VPZ_ def : InstRW<[CortexA510MCWrite<8, 5, CortexA510UnitVALU>], (instrs FADDA_VPZ_D)>; // Floating point compare -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", +def : InstRW<[CortexA510MCWrite<4, 2, CortexA510UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", "^FCM(LE|LT)_PPzZ0_[HSD]", "^FCMUO_PPzZZ_[HSD]")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index 91a7079..e798222 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -1785,7 +1785,7 @@ def : InstRW<[N2Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[N2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[N2Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index ecfb124..e44d40f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -1757,7 +1757,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[N3Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 3686654..44625a2 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -1575,7 +1575,7 @@ def : InstRW<[V1Write_2c_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]", "^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI)>; +def : InstRW<[V1Write_2c_1V01], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE)>; // Extract/insert operation, SIMD and FP scalar form def : InstRW<[V1Write_3c_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$", diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index b2c3da0..6261220 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2272,7 +2272,7 @@ def : InstRW<[V2Write_2c_1V13], (instregex "^[SU]XTB_ZPmZ_[HSD]", 
"^[SU]XTW_ZPmZ_[D]")>; // Extract -def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; +def : InstRW<[V2Write_2c_1V], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; // Extract narrow saturating def : InstRW<[V2Write_4c_1V13], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 8a5b5ba..d3b1aa6 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -182,37 +182,25 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); const AArch64TargetLowering *TLI = STI.getTargetLowering(); - TargetLowering::ArgListEntry DstEntry; - DstEntry.Ty = PointerType::getUnqual(*DAG.getContext()); - DstEntry.Node = Dst; TargetLowering::ArgListTy Args; - Args.push_back(DstEntry); + Args.emplace_back(Dst, PointerType::getUnqual(*DAG.getContext())); RTLIB::Libcall NewLC; switch (LC) { case RTLIB::MEMCPY: { NewLC = RTLIB::SC_MEMCPY; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMMOVE: { NewLC = RTLIB::SC_MEMMOVE; - TargetLowering::ArgListEntry Entry; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMSET: { NewLC = RTLIB::SC_MEMSET; - TargetLowering::ArgListEntry Entry; - Entry.Ty = Type::getInt32Ty(*DAG.getContext()); - Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32); - Entry.Node = Src; - Args.push_back(Entry); + Args.emplace_back(DAG.getZExtOrTrunc(Src, DL, MVT::i32), + Type::getInt32Ty(*DAG.getContext())); break; } default: @@ -221,10 +209,7 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout()); SDValue Symbol = DAG.getExternalSymbol(TLI->getLibcallName(NewLC), PointerVT); - TargetLowering::ArgListEntry SizeEntry; - SizeEntry.Node = Size; - SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Args.push_back(SizeEntry); + Args.emplace_back(Size, DAG.getDataLayout().getIntPtrType(*DAG.getContext())); TargetLowering::CallLoweringInfo CLI(DAG); PointerType *RetTy = PointerType::getUnqual(*DAG.getContext()); diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index f136a184..a67bd42 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ClMaxLifetimes); if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - uint64_t Size = - cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + uint64_t Size = *Info.AI->getAllocationSize(*DL); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), TagPCall, Size); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 95eab16..e67bd58 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -224,6 +224,11 @@ static cl::opt<bool> cl::desc("Enable Machine Pipeliner for AArch64"), cl::init(false), cl::Hidden); +static 
cl::opt<bool> + EnableNewSMEABILowering("aarch64-new-sme-abi", + cl::desc("Enable new lowering for the SME ABI"), + cl::init(false), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. @@ -263,6 +268,7 @@ LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(PR); initializeKCFIPass(PR); initializeSMEABIPass(PR); + initializeMachineSMEABIPass(PR); initializeSMEPeepholeOptPass(PR); initializeSVEIntrinsicOptsPass(PR); initializeAArch64SpeculationHardeningPass(PR); @@ -367,7 +373,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, computeDefaultCPU(TT, CPU), FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveAArch64CodeModel(TT, CM, JIT), OL), - TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { + TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian), + UseNewSMEABILowering(EnableNewSMEABILowering) { initAsmInfo(); if (TT.isOSBinFormatMachO()) { @@ -668,10 +675,12 @@ void AArch64PassConfig::addIRPasses() { addPass(createInterleavedAccessPass()); } - // Expand any functions marked with SME attributes which require special - // changes for the calling convention or that require the lazy-saving - // mechanism specified in the SME ABI. - addPass(createSMEABIPass()); + if (!EnableNewSMEABILowering) { + // Expand any functions marked with SME attributes which require special + // changes for the calling convention or that require the lazy-saving + // mechanism specified in the SME ABI. + addPass(createSMEABIPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) { @@ -782,6 +791,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } void AArch64PassConfig::addMachineSSAOptimization() { + if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createMachineSMEABIPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt) addPass(createSMEPeepholeOptPass()); @@ -812,6 +824,9 @@ bool AArch64PassConfig::addILPOpts() { } void AArch64PassConfig::addPreRegAlloc() { + if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering) + addPass(createMachineSMEABIPass()); + // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOptLevel::None && EnableDeadRegisterElimination) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index b9e522d..0dd5d95 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -79,8 +79,12 @@ public: size_t clearLinkerOptimizationHints( const SmallPtrSetImpl<MachineInstr *> &MIs) const override; + /// Returns true if the new SME ABI lowering should be used. + bool useNewSMEABILowering() const { return UseNewSMEABILowering; } + private: bool isLittle; + bool UseNewSMEABILowering; }; // AArch64 little endian target machine. 
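For reference, -aarch64-new-sme-abi is an internal cl::opt rather than a driver flag, so one plausible way to exercise the new lowering (input file names are illustrative) is:

  llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi za_state.ll -o -
  clang --target=aarch64-linux-gnu -march=armv9-a+sme -mllvm -aarch64-new-sme-abi -S za_state.c

With the flag enabled, the MachineSMEABI pass runs before register allocation (during machine SSA optimization at -O1 and above, in addPreRegAlloc at -O0) and the IR-level SMEABIPass is skipped.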
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f05add..b021968 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -220,20 +220,17 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( static cl::opt<bool> EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); -static bool isSMEABIRoutineCall(const CallInst &CI) { +static bool isSMEABIRoutineCall(const CallInst &CI, + const AArch64TargetLowering &TLI) { const auto *F = CI.getCalledFunction(); - return F && StringSwitch<bool>(F->getName()) - .Case("__arm_sme_state", true) - .Case("__arm_tpidr2_save", true) - .Case("__arm_tpidr2_restore", true) - .Case("__arm_za_disable", true) - .Default(false); + return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); } /// Returns true if the function has explicit operations that can only be /// lowered using incompatible instructions for the selected mode. This also /// returns true if the function F may use or modify ZA state. -static bool hasPossibleIncompatibleOps(const Function *F) { +static bool hasPossibleIncompatibleOps(const Function *F, + const AArch64TargetLowering &TLI) { for (const BasicBlock &BB : *F) { for (const Instruction &I : BB) { // Be conservative for now and assume that any call to inline asm or to @@ -242,7 +239,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) { // all native LLVM instructions can be lowered to compatible instructions. if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() && (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) || - isSMEABIRoutineCall(cast<CallInst>(I)))) + isSMEABIRoutineCall(cast<CallInst>(I), TLI))) return true; } } @@ -290,7 +287,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) { - if (hasPossibleIncompatibleOps(Callee)) + if (hasPossibleIncompatibleOps(Callee, *getTLI())) return false; } @@ -357,7 +354,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // change only once and avoid inlining of G into F. SMEAttrs FAttrs(*F); - SMECallAttrs CallAttrs(Call); + SMECallAttrs CallAttrs(Call, getTLI()); if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) @@ -554,7 +551,17 @@ static bool isUnpackedVectorVT(EVT VecVT) { VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; } -static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { +static InstructionCost getHistogramCost(const AArch64Subtarget *ST, + const IntrinsicCostAttributes &ICA) { + // We need to know at least the number of elements in the vector of buckets + // and the size of each element to update. + if (ICA.getArgTypes().size() < 2) + return InstructionCost::getInvalid(); + + // Only interested in costing for the hardware instruction from SVE2. 
+ if (!ST->hasSVE2()) + return InstructionCost::getInvalid(); + Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements unsigned TotalHistCnts = 1; @@ -579,9 +586,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; TotalHistCnts = EC / NaturalVectorWidth; + + return InstructionCost(BaseHistCntCost * TotalHistCnts); } - return InstructionCost(BaseHistCntCost * TotalHistCnts); + return InstructionCost::getInvalid(); } InstructionCost @@ -597,10 +606,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { - case Intrinsic::experimental_vector_histogram_add: - if (!ST->hasSVE2()) - return InstructionCost::getInvalid(); - return getHistogramCost(ICA); + case Intrinsic::experimental_vector_histogram_add: { + InstructionCost HistCost = getHistogramCost(ST, ICA); + // If the cost isn't valid, we may still be able to scalarize + if (HistCost.isValid()) + return HistCost; + break; + } case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: @@ -651,6 +663,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return LT.first; break; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + // Given a fma or fmuladd, cost it the same as a fmul instruction which are + // usually the same for costs. TODO: Add fp16 and bf16 expansion costs. + Type *EltTy = RetTy->getScalarType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || + (EltTy->isHalfTy() && ST->hasFullFP16())) + return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind); + break; + } case Intrinsic::stepvector: { InstructionCost Cost = 1; // Cost of the `index' instruction auto LT = getTypeLegalizationCost(RetTy); @@ -3961,6 +3983,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I); } +InstructionCost +AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const { + if (isa<FixedVectorType>(Val)) + return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, + Index); + + // This typically requires both while and lastb instructions in order + // to extract the last element. If this is in a loop the while + // instruction can at least be hoisted out, although it will consume a + // predicate register. The cost should be more expensive than the base + // extract cost, which is 2 for most CPUs. + return CostKind == TTI::TCK_CodeSize + ? 
2 + : ST->getVectorInsertExtractBaseCost() + 1; +} + InstructionCost AArch64TTIImpl::getScalarizationOverhead( VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc, @@ -3975,6 +4015,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead( return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; } +std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost( + Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const { + if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy()) + return std::nullopt; + if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16()) + return std::nullopt; + + Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext())); + InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; + Cost += InstCost(PromotedTy); + if (IncludeTrunc) + Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy, + TTI::CastContextHint::None, CostKind); + return Cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, @@ -3997,6 +4058,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + // Increase the cost for half and bfloat types if not architecturally + // supported. + if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL || + ISD == ISD::FDIV || ISD == ISD::FREM) + if (auto PromotedCost = getFP16BF16PromoteCost( + Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true, + [&](Type *PromotedTy) { + return getArithmeticInstrCost(Opcode, PromotedTy, CostKind, + Op1Info, Op2Info); + })) + return *PromotedCost; + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4265,11 +4338,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( [[fallthrough]]; case ISD::FADD: case ISD::FSUB: - // Increase the cost for half and bfloat types if not architecturally - // supported. - if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || - (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) - return 2 * LT.first; if (!Ty->getScalarType()->isFP128Ty()) return LT.first; [[fallthrough]]; @@ -4293,8 +4361,9 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } InstructionCost -AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, - const SCEV *Ptr) const { +AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting @@ -4302,7 +4371,7 @@ AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && + if (PtrTy->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; @@ -4371,25 +4440,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } if (Opcode == Instruction::FCmp) { - // Without dedicated instructions we promote f16 + bf16 compares to f32. - if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || - ValTy->getScalarType()->isBFloatTy()) { - Type *PromotedTy = - ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); - InstructionCost Cost = - getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - if (!Op1Info.isConstant() && !Op2Info.isConstant()) - Cost *= 2; - Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, - Op1Info, Op2Info); - if (ValTy->isVectorTy()) - Cost += getCastInstrCost( - Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), - VectorType::getInteger(cast<VectorType>(PromotedTy)), - TTI::CastContextHint::None, CostKind); - return Cost; - } + if (auto PromotedCost = getFP16BF16PromoteCost( + ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false, + [&](Type *PromotedTy) { + InstructionCost Cost = + getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, + CostKind, Op1Info, Op2Info); + if (isa<VectorType>(PromotedTy)) + Cost += getCastInstrCost( + Instruction::Trunc, + VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); + return Cost; + })) + return *PromotedCost; auto LT = getTypeLegalizationCost(ValTy); // Model unknown fp compares as a libcall. @@ -4858,32 +4923,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Limit to loops with trip counts that are cheap to expand. UP.SCEVExpansionBudget = 1; - // Try to unroll small, single block loops, if they have load/store - // dependencies, to expose more parallel memory access streams. + // Try to unroll small loops, of few-blocks with low budget, if they have + // load/store dependencies, to expose more parallel memory access streams, + // or if they do little work inside a block (i.e. load -> X -> store pattern). BasicBlock *Header = L->getHeader(); - if (Header == L->getLoopLatch()) { + BasicBlock *Latch = L->getLoopLatch(); + if (Header == Latch) { // Estimate the size of the loop. unsigned Size; - if (!isLoopSizeWithinBudget(L, TTI, 8, &Size)) + unsigned Width = 10; + if (!isLoopSizeWithinBudget(L, TTI, Width, &Size)) return; - SmallPtrSet<Value *, 8> LoadedValues; - SmallVector<StoreInst *> Stores; - for (auto *BB : L->blocks()) { - for (auto &I : *BB) { - Value *Ptr = getLoadStorePointerOperand(&I); - if (!Ptr) - continue; - const SCEV *PtrSCEV = SE.getSCEV(Ptr); - if (SE.isLoopInvariant(PtrSCEV, L)) - continue; - if (isa<LoadInst>(&I)) - LoadedValues.insert(&I); - else - Stores.push_back(cast<StoreInst>(&I)); - } - } - // Try to find an unroll count that maximizes the use of the instruction // window, i.e. trying to fetch as many instructions per cycle as possible. 
unsigned MaxInstsPerLine = 16; @@ -4902,8 +4953,32 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, UC++; } - if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) { - return LoadedValues.contains(SI->getOperand(0)); + if (BestUC == 1) + return; + + SmallPtrSet<Value *, 8> LoadedValuesPlus; + SmallVector<StoreInst *> Stores; + for (auto *BB : L->blocks()) { + for (auto &I : *BB) { + Value *Ptr = getLoadStorePointerOperand(&I); + if (!Ptr) + continue; + const SCEV *PtrSCEV = SE.getSCEV(Ptr); + if (SE.isLoopInvariant(PtrSCEV, L)) + continue; + if (isa<LoadInst>(&I)) { + LoadedValuesPlus.insert(&I); + // Include in-loop 1st users of loaded values. + for (auto *U : I.users()) + if (L->contains(cast<Instruction>(U))) + LoadedValuesPlus.insert(U); + } else + Stores.push_back(cast<StoreInst>(&I)); + } + } + + if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) { + return LoadedValuesPlus.contains(SI->getOperand(0)); })) return; @@ -4915,7 +4990,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Try to runtime-unroll loops with early-continues depending on loop-varying // loads; this helps with branch-prediction for the early-continues. auto *Term = dyn_cast<BranchInst>(Header->getTerminator()); - auto *Latch = L->getLoopLatch(); SmallVector<BasicBlock *> Preds(predecessors(Latch)); if (!Term || !Term->isConditional() || Preds.size() == 1 || !llvm::is_contained(Preds, Header) || @@ -5151,6 +5225,8 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( return false; switch (RdxDesc.getRecurrenceKind()) { + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Add: case RecurKind::FAdd: case RecurKind::And: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7f45177..42ae962 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -221,6 +221,11 @@ public: unsigned Index) const override; InstructionCost + getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const override; + + InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override; @@ -238,8 +243,9 @@ public: ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) const override; - InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, @@ -435,6 +441,14 @@ public: bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); } + /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the + /// architecture features are not present. 
+ std::optional<InstructionCost> + getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const; + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 66136a4..803943f 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -89,6 +89,7 @@ add_llvm_target(AArch64CodeGen SMEABIPass.cpp SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp + MachineSMEABIPass.cpp AArch64SIMDInstrOpt.cpp DEPENDS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 2155ace..79bef76 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -128,9 +128,9 @@ struct AArch64OutgoingValueAssigner if (!Flags.isVarArg() && !UseVarArgsCCForFixed) { if (!IsReturn) applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); - Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); } else - Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); StackSize = State.getStackSize(); return Res; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index f359731..ee34a85 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1349,7 +1349,9 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True, return &*SelectInst; } -static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { +static AArch64CC::CondCode +changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {}, + MachineRegisterInfo *MRI = nullptr) { switch (P) { default: llvm_unreachable("Unknown condition code!"); @@ -1360,8 +1362,18 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { case CmpInst::ICMP_SGT: return AArch64CC::GT; case CmpInst::ICMP_SGE: + if (RHS && MRI) { + auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI); + if (ValAndVReg && ValAndVReg->Value == 0) + return AArch64CC::PL; + } return AArch64CC::GE; case CmpInst::ICMP_SLT: + if (RHS && MRI) { + auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI); + if (ValAndVReg && ValAndVReg->Value == 0) + return AArch64CC::MI; + } return AArch64CC::LT; case CmpInst::ICMP_SLE: return AArch64CC::LE; @@ -1813,7 +1825,8 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp( auto &PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - static_cast<CmpInst::Predicate>(PredOp.getPredicate())); + static_cast<CmpInst::Predicate>(PredOp.getPredicate()), + ICmp.getOperand(3).getReg(), MIB.getMRI()); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); I.eraseFromParent(); return true; @@ -2510,8 +2523,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); auto Pred = 
static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate(Pred), Cmp->getOperand(3).getReg(), &MRI); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; @@ -3577,8 +3590,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto &PredOp = I.getOperand(1); emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB); auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); + const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC( + CmpInst::getInversePredicate(Pred), I.getOperand(3).getReg(), &MRI); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); @@ -4931,7 +4944,7 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec( if (Negate) CC = CmpInst::getInversePredicate(CC); if (isa<GICmp>(Cmp)) { - OutCC = changeICMPPredToAArch64CC(CC); + OutCC = changeICMPPredToAArch64CC(CC, RHS, MIB.getMRI()); } else { // Handle special FP cases. AArch64CC::CondCode ExtraCC; @@ -5101,7 +5114,8 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp, MIB); auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - CondCode = changeICMPPredToAArch64CC(Pred); + CondCode = + changeICMPPredToAArch64CC(Pred, CondDef->getOperand(3).getReg(), &MRI); } else { // Get the condition code for the select. auto Pred = diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 3ba08c8..6025f1c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -614,8 +614,7 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, // x uge c => x ugt c - 1 // // When c is not zero. - if (C == 0) - return std::nullopt; + assert(C != 0 && "C should not be zero here!"); P = (P == CmpInst::ICMP_ULT) ? 
CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; C -= 1; break; @@ -656,14 +655,13 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, if (isLegalArithImmed(C)) return {{C, P}}; - auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) { + auto NumberOfInstrToLoadImm = [=](uint64_t Imm) { SmallVector<AArch64_IMM::ImmInsnModel> Insn; AArch64_IMM::expandMOVImm(Imm, 32, Insn); - return Insn.size() == 1; + return Insn.size(); }; - if (!IsMaterializableInSingleInstruction(OriginalC) && - IsMaterializableInSingleInstruction(C)) + if (NumberOfInstrToLoadImm(OriginalC) > NumberOfInstrToLoadImm(C)) return {{C, P}}; return std::nullopt; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 45ac023..a388216 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -40,6 +40,7 @@ protected: bool IsPCRel) const override; bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override; bool isNonILP32reloc(const MCFixup &Fixup, AArch64::Specifier RefKind) const; + void sortRelocs(std::vector<ELFRelocationEntry> &Relocs) override; bool IsILP32; }; @@ -498,6 +499,17 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, Val.getSpecifier()); } +void AArch64ELFObjectWriter::sortRelocs( + std::vector<ELFRelocationEntry> &Relocs) { + // PATCHINST relocations should be applied last because they may overwrite the + // whole instruction and so should take precedence over other relocations that + // modify operands of the original instruction. + std::stable_partition(Relocs.begin(), Relocs.end(), + [](const ELFRelocationEntry &R) { + return R.Type != ELF::R_AARCH64_PATCHINST; + }); +} + std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) { return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 14547e3..917dbdf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -35,7 +35,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/AArch64BuildAttributes.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 3c8b571..54b58e9 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1017,14 +1017,22 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, else return false; + StringRef Reg = getRegisterName(MI->getOperand(4).getReg()); + bool NotXZR = Reg != "xzr"; + + // If the TableGen definition does not mandate a register operand + // (i.e. no register operand should be present), but the register value + // is not xzr/x31, then fall back to the generic SYS form instead. 
+ if (NotXZR && !NeedsReg) + return false; + std::string Str = Ins + Name; llvm::transform(Str, Str.begin(), ::tolower); O << '\t' << Str; - if (NeedsReg) { - O << ", "; - printRegName(O, MI->getOperand(4).getReg()); - } + + if (NeedsReg) + O << ", " << Reg; return true; } diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp new file mode 100644 index 0000000..b58dfdf --- /dev/null +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -0,0 +1,696 @@ +//===- MachineSMEABIPass.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements the SME ABI requirements for ZA state. This includes +// implementing the lazy ZA state save schemes around calls. +// +//===----------------------------------------------------------------------===// +// +// This pass works by collecting instructions that require ZA to be in a +// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state +// transitions to ensure ZA is in the required state before instructions. State +// transitions represent actions such as setting up or restoring a lazy save. +// Certain points within a function may also have predefined states independent +// of any instructions, for example, a "shared_za" function is always entered +// and exited in the "ACTIVE" state. +// +// To handle ZA state across control flow, we make use of edge bundling. This +// assigns each block an "incoming" and "outgoing" edge bundle (representing +// incoming and outgoing edges). Initially, these are unique to each block; +// then, in the process of forming bundles, the outgoing bundle of a block is +// joined with the incoming bundle of all successors. The result is that each +// bundle can be assigned a single ZA state, which ensures the state required by +// all of a block's successors is the same, and that each basic block will always +// be entered with the same ZA state. This eliminates the need for splitting +// edges to insert state transitions or "phi" nodes for ZA states. +// +// See below for a simple example of edge bundling. +// +// The following shows a conditionally executed basic block (BB1): +// +// if (cond) +// BB1 +// BB2 +// +// Initial Bundles Joined Bundles +// +// ┌──0──┐ ┌──0──┐ +// │ BB0 │ │ BB0 │ +// └──1──┘ └──1──┘ +// ├───────┐ ├───────┐ +// ▼ │ ▼ │ +// ┌──2──┐ │ ─────► ┌──1──┐ │ +// │ BB1 │ ▼ │ BB1 │ ▼ +// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐ +// └───►4 BB2 │ └───►1 BB2 │ +// └──5──┘ └──2──┘ +// +// On the left are the initial per-block bundles, and on the right are the +// joined bundles (which are the result of the EdgeBundles analysis). 
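Editor's note: to make the bundle-joining step described in the comment above concrete, here is a small stand-alone sketch. It is not LLVM's EdgeBundles implementation (which the pass reuses), just a plain union-find over 2*N bundle ids that mirrors the "Initial Bundles" to "Joined Bundles" transition in the diagram; the block and bundle numbering is illustrative.

// Stand-alone illustration of edge bundling, assuming bundle ids
//   2*B   = incoming bundle of block B,
//   2*B+1 = outgoing bundle of block B.
#include <cassert>
#include <numeric>
#include <vector>

struct BundleUnionFind {
  std::vector<unsigned> Parent;
  explicit BundleUnionFind(unsigned NumBlocks) : Parent(2 * NumBlocks) {
    // Initially every incoming/outgoing bundle is unique to its block.
    std::iota(Parent.begin(), Parent.end(), 0u);
  }
  unsigned find(unsigned X) {
    while (Parent[X] != X)
      X = Parent[X] = Parent[Parent[X]]; // Path halving.
    return X;
  }
  // For a CFG edge Pred -> Succ, merge Pred's outgoing bundle with Succ's
  // incoming bundle, so every edge meeting at that point shares one bundle
  // (and, later, one ZA state).
  void joinEdge(unsigned Pred, unsigned Succ) {
    Parent[find(2 * Pred + 1)] = find(2 * Succ);
  }
  unsigned getBundle(unsigned Block, bool Out) {
    return find(2 * Block + (Out ? 1 : 0));
  }
};

int main() {
  BundleUnionFind B(3); // BB0, BB1, BB2 from the diagram above.
  B.joinEdge(0, 1);     // BB0 -> BB1
  B.joinEdge(0, 2);     // BB0 -> BB2
  B.joinEdge(1, 2);     // BB1 -> BB2
  // BB0's outgoing bundle, BB1's incoming/outgoing bundles and BB2's incoming
  // bundle have collapsed into a single bundle, as in the joined diagram.
  assert(B.getBundle(0, true) == B.getBundle(1, false));
  assert(B.getBundle(1, true) == B.getBundle(2, false));
  return 0;
}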
+ +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-machine-sme-abi" + +namespace { + +enum ZAState { + // Any/unknown state (not valid) + ANY = 0, + + // ZA is in use and active (i.e. within the accumulator) + ACTIVE, + + // A ZA save has been set up or committed (i.e. ZA is dormant or off) + LOCAL_SAVED, + + // ZA is off or a lazy save has been set up by the caller + CALLER_DORMANT, + + // ZA is off + OFF, + + // The number of ZA states (not a valid state) + NUM_ZA_STATE +}; + +/// A bitmask enum to record live physical registers that the "emit*" routines +/// may need to preserve. Note: This only tracks registers we may clobber. +enum LiveRegs : uint8_t { + None = 0, + NZCV = 1 << 0, + W0 = 1 << 1, + W0_HI = 1 << 2, + X0 = W0 | W0_HI, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI) +}; + +/// Holds the virtual registers live physical registers have been saved to. +struct PhysRegSave { + LiveRegs PhysLiveRegs; + Register StatusFlags = AArch64::NoRegister; + Register X0Save = AArch64::NoRegister; +}; + +static bool isLegalEdgeBundleZAState(ZAState State) { + switch (State) { + case ZAState::ACTIVE: + case ZAState::LOCAL_SAVED: + return true; + default: + return false; + } +} +struct TPIDR2State { + int FrameIndex = -1; +}; + +StringRef getZAStateString(ZAState State) { +#define MAKE_CASE(V) \ + case V: \ + return #V; + switch (State) { + MAKE_CASE(ZAState::ANY) + MAKE_CASE(ZAState::ACTIVE) + MAKE_CASE(ZAState::LOCAL_SAVED) + MAKE_CASE(ZAState::CALLER_DORMANT) + MAKE_CASE(ZAState::OFF) + default: + llvm_unreachable("Unexpected ZAState"); + } +#undef MAKE_CASE +} + +static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI, + const MachineOperand &MO) { + if (!MO.isReg() || !MO.getReg().isPhysical()) + return false; + return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) { + return AArch64::MPR128RegClass.contains(SR) || + AArch64::ZTRRegClass.contains(SR); + }); +} + +/// Returns the required ZA state needed before \p MI and an iterator pointing +/// to where any code required to change the ZA state should be inserted. +static std::pair<ZAState, MachineBasicBlock::iterator> +getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, + bool ZAOffAtReturn) { + MachineBasicBlock::iterator InsertPt(MI); + + if (MI.getOpcode() == AArch64::InOutZAUsePseudo) + return {ZAState::ACTIVE, std::prev(InsertPt)}; + + if (MI.getOpcode() == AArch64::RequiresZASavePseudo) + return {ZAState::LOCAL_SAVED, std::prev(InsertPt)}; + + if (MI.isReturn()) + return {ZAOffAtReturn ? 
ZAState::OFF : ZAState::ACTIVE, InsertPt}; + + for (auto &MO : MI.operands()) { + if (isZAorZT0RegOp(TRI, MO)) + return {ZAState::ACTIVE, InsertPt}; + } + + return {ZAState::ANY, InsertPt}; +} + +struct MachineSMEABI : public MachineFunctionPass { + inline static char ID = 0; + + MachineSMEABI() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "Machine SME ABI pass"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundlesWrapperLegacy>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + /// Collects the needed ZA state (and live registers) before each instruction + /// within the machine function. + void collectNeededZAStates(SMEAttrs); + + /// Assigns each edge bundle a ZA state based on the needed states of blocks + /// that have incoming or outgoing edges in that bundle. + void assignBundleZAStates(); + + /// Inserts code to handle changes between ZA states within the function. + /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA. + void insertStateChanges(); + + // Emission routines for private and shared ZA functions (using lazy saves). + void emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs); + void emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2); + + void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + ZAState From, ZAState To, LiveRegs PhysLiveRegs); + + /// Save live physical registers to virtual registers. + PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + /// Restore physical registers from a save of their previous values. + void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + + /// Get or create a TPIDR2 block in this function. + TPIDR2State getTPIDR2Block(); + +private: + /// Contains the needed ZA state (and live registers) at an instruction. + struct InstInfo { + ZAState NeededState{ZAState::ANY}; + MachineBasicBlock::iterator InsertPt; + LiveRegs PhysLiveRegs = LiveRegs::None; + }; + + /// Contains the needed ZA state for each instruction in a block. + /// Instructions that do not require a ZA state are not recorded. + struct BlockInfo { + ZAState FixedEntryState{ZAState::ANY}; + SmallVector<InstInfo> Insts; + LiveRegs PhysLiveRegsAtExit = LiveRegs::None; + }; + + // All pass state that must be cleared between functions. 
+ struct PassState { + SmallVector<BlockInfo> Blocks; + SmallVector<ZAState> BundleStates; + std::optional<TPIDR2State> TPIDR2Block; + } State; + + MachineFunction *MF = nullptr; + EdgeBundles *Bundles = nullptr; + const AArch64Subtarget *Subtarget = nullptr; + const AArch64RegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; +}; + +void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { + assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) && + "Expected function to have ZA/ZT0 state!"); + + State.Blocks.resize(MF->getNumBlockIDs()); + for (MachineBasicBlock &MBB : *MF) { + BlockInfo &Block = State.Blocks[MBB.getNumber()]; + if (&MBB == &MF->front()) { + // Entry block: + Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface() + ? ZAState::CALLER_DORMANT + : ZAState::ACTIVE; + } else if (MBB.isEHPad()) { + // EH entry block: + Block.FixedEntryState = ZAState::LOCAL_SAVED; + } + + LiveRegUnits LiveUnits(*TRI); + LiveUnits.addLiveOuts(MBB); + + auto GetPhysLiveRegs = [&] { + LiveRegs PhysLiveRegs = LiveRegs::None; + if (!LiveUnits.available(AArch64::NZCV)) + PhysLiveRegs |= LiveRegs::NZCV; + // We have to track W0 and X0 separately as otherwise things can get + // confused if we attempt to preserve X0 but only W0 was defined. + if (!LiveUnits.available(AArch64::W0)) + PhysLiveRegs |= LiveRegs::W0; + if (!LiveUnits.available(AArch64::W0_HI)) + PhysLiveRegs |= LiveRegs::W0_HI; + return PhysLiveRegs; + }; + + Block.PhysLiveRegsAtExit = GetPhysLiveRegs(); + auto FirstTerminatorInsertPt = MBB.getFirstTerminator(); + for (MachineInstr &MI : reverse(MBB)) { + MachineBasicBlock::iterator MBBI(MI); + LiveUnits.stepBackward(MI); + LiveRegs PhysLiveRegs = GetPhysLiveRegs(); + auto [NeededState, InsertPt] = getZAStateBeforeInst( + *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); + assert((InsertPt == MBBI || + InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) && + "Unexpected state change insertion point!"); + // TODO: Do something to avoid state changes where NZCV is live. + if (MBBI == FirstTerminatorInsertPt) + Block.PhysLiveRegsAtExit = PhysLiveRegs; + if (NeededState != ZAState::ANY) + Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs}); + } + + // Reverse vector (as we had to iterate backwards for liveness). + std::reverse(Block.Insts.begin(), Block.Insts.end()); + } +} + +void MachineSMEABI::assignBundleZAStates() { + State.BundleStates.resize(Bundles->getNumBundles()); + for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) { + LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); + + // Attempt to assign a ZA state for this bundle that minimizes state + // transitions. Edges within loops are given a higher weight as we assume + // they will be executed more than once. + // TODO: We should propagate desired incoming/outgoing states through blocks + // that have the "ANY" state first to make better global decisions. + int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; + for (unsigned BlockID : Bundles->getBlocks(I)) { + LLVM_DEBUG(dbgs() << "- bb." 
<< BlockID); + + const BlockInfo &Block = State.Blocks[BlockID]; + if (Block.Insts.empty()) { + LLVM_DEBUG(dbgs() << " (no state preference)\n"); + continue; + } + bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I; + bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I; + + ZAState DesiredIncomingState = Block.Insts.front().NeededState; + if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { + EdgeStateCounts[DesiredIncomingState]++; + LLVM_DEBUG(dbgs() << " DesiredIncomingState: " + << getZAStateString(DesiredIncomingState)); + } + ZAState DesiredOutgoingState = Block.Insts.back().NeededState; + if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) { + EdgeStateCounts[DesiredOutgoingState]++; + LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " + << getZAStateString(DesiredOutgoingState)); + } + LLVM_DEBUG(dbgs() << '\n'); + } + + ZAState BundleState = + ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); + + // Force ZA to be active in bundles that don't have a preferred state. + // TODO: Something better here (to avoid extra mode switches). + if (BundleState == ZAState::ANY) + BundleState = ZAState::ACTIVE; + + LLVM_DEBUG({ + dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n' + << "Edge counts:"; + for (auto [State, Count] : enumerate(EdgeStateCounts)) + dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count; + dbgs() << "\n\n"; + }); + + State.BundleStates[I] = BundleState; + } +} + +void MachineSMEABI::insertStateChanges() { + for (MachineBasicBlock &MBB : *MF) { + const BlockInfo &Block = State.Blocks[MBB.getNumber()]; + ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(), + /*Out=*/false)]; + + ZAState CurrentState = Block.FixedEntryState; + if (CurrentState == ZAState::ANY) + CurrentState = InState; + + for (auto &Inst : Block.Insts) { + if (CurrentState != Inst.NeededState) + emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState, + Inst.PhysLiveRegs); + CurrentState = Inst.NeededState; + } + + if (MBB.succ_empty()) + continue; + + ZAState OutState = + State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)]; + if (CurrentState != OutState) + emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState, + Block.PhysLiveRegsAtExit); + } +} + +TPIDR2State MachineSMEABI::getTPIDR2Block() { + if (State.TPIDR2Block) + return *State.TPIDR2Block; + MachineFrameInfo &MFI = MF->getFrameInfo(); + State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)}; + return *State.TPIDR2Block; +} + +static DebugLoc getDebugLoc(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + if (MBBI != MBB.end()) + return MBBI->getDebugLoc(); + return DebugLoc(); +} + +void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get pointer to TPIDR2 block. + Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); + Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr) + .addReg(TPIDR2); + // Set TPIDR2_EL0 to point to TPIDR2 block. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(TPIDR2Ptr); +} + +PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + PhysRegSave RegSave{PhysLiveRegs}; + if (PhysLiveRegs & LiveRegs::NZCV) { + RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit); + } + // Note: Preserving X0 is "free" as this is before register allocation, so + // the register allocator is still able to optimize these copies. + if (PhysLiveRegs & LiveRegs::W0) { + RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI + ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save) + .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0); + } + return RegSave; +} + +void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + if (RegSave.StatusFlags != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(RegSave.StatusFlags) + .addReg(AArch64::NZCV, RegState::ImplicitDefine); + + if (RegSave.X0Save != AArch64::NoRegister) + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), + RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0) + .addReg(RegSave.X0Save); +} + +void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register TPIDR2 = AArch64::X0; + + // TODO: Emit these within the restore MBB to prevent unnecessary saves. + PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); + + // Enable ZA. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); + // Get current TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0) + .addImm(AArch64SysReg::TPIDR2_EL0); + // Get pointer to TPIDR2 block. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0) + .addImm(0); + // (Conditionally) restore ZA state. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo)) + .addReg(TPIDR2EL0) + .addReg(TPIDR2) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + // Zero TPIDR2_EL0. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + restorePhyRegSave(RegSave, MBB, MBBI, DL); +} + +void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + + if (ClearTPIDR2) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + + // Disable ZA. 
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(0); +} + +void MachineSMEABI::emitAllocateLazySaveBuffer( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineFrameInfo &MFI = MF->getFrameInfo(); + + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + // Calculate SVL. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1); + + // 1. Allocate the lazy save buffer. + { + // TODO This function grows the stack with a subtraction, which doesn't work + // on Windows. Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME + assert(!Subtarget->isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); + // Get original stack pointer. + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer) + .addReg(SVL) + .addReg(SVL) + .addReg(SP); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP) + .addReg(Buffer); + // We have just allocated a variable sized object, tell this to PEI. + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + // 2. Setup the TPIDR2 block. + { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); + + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi)) + .addReg(Buffer) + .addReg(SVL) + .addFrameIndex(getTPIDR2Block().FrameIndex) + .addImm(0); + } +} + +void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + + // Get current TPIDR2_EL0. + Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS)) + .addReg(TPIDR2EL0, RegState::Define) + .addImm(AArch64SysReg::TPIDR2_EL0); + // If TPIDR2_EL0 is non-zero, commit the lazy save. + // NOTE: Functions that only use ZT0 don't need to zero ZA. + bool ZeroZA = + MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState(); + auto CommitZASave = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) + .addReg(TPIDR2EL0) + .addImm(ZeroZA ? 1 : 0) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE)) + .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + if (ZeroZA) + CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine); + // Enable ZA (as ZA could have previously been in the OFF state). + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) + .addImm(AArch64SVCR::SVCRZA) + .addImm(1); +} + +void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + ZAState From, ZAState To, + LiveRegs PhysLiveRegs) { + + // ZA not used. 
+ if (From == ZAState::ANY || To == ZAState::ANY) + return; + + // If we're exiting from the CALLER_DORMANT state that means this new ZA + // function did not touch ZA (so ZA was never turned on). + if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF) + return; + + // TODO: Avoid setting up the save buffer if there's no transition to + // LOCAL_SAVED. + if (From == ZAState::CALLER_DORMANT) { + assert(MBB.getParent() + ->getInfo<AArch64FunctionInfo>() + ->getSMEFnAttrs() + .hasPrivateZAInterface() && + "CALLER_DORMANT state requires private ZA interface"); + assert(&MBB == &MBB.getParent()->front() && + "CALLER_DORMANT state only valid in entry block"); + emitNewZAPrologue(MBB, MBB.getFirstNonPHI()); + if (To == ZAState::ACTIVE) + return; // Nothing more to do (ZA is active after the prologue). + + // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save + // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this + // case by changing the placement of the zero instruction. + From = ZAState::ACTIVE; + } + + if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) + emitSetupLazySave(MBB, InsertPt); + else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) + emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs); + else if (To == ZAState::OFF) { + assert(From != ZAState::CALLER_DORMANT && + "CALLER_DORMANT to OFF should have already been handled"); + emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED); + } else { + dbgs() << "Error: Transition from " << getZAStateString(From) << " to " + << getZAStateString(To) << '\n'; + llvm_unreachable("Unimplemented state transition"); + } +} + +} // end anonymous namespace + +INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI", + false, false) + +bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getSubtarget<AArch64Subtarget>().hasSME()) + return false; + + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); + if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State()) + return false; + + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + // Reset pass state. + State = PassState{}; + this->MF = &MF; + Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + collectNeededZAStates(SMEFnAttrs); + assignBundleZAStates(); + insertStateChanges(); + + // Allocate save buffer (if needed). 
+ if (State.TPIDR2Block) { + MachineBasicBlock &EntryBlock = MF.front(); + emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI()); + } + + return true; +} + +FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); } diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 4af4d49..2008516 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -15,11 +15,16 @@ #include "AArch64.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -33,9 +38,13 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + } + private: bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, - SMEAttrs FnAttrs); + SMEAttrs FnAttrs, const TargetLowering &TLI); }; } // end anonymous namespace @@ -51,14 +60,16 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } //===----------------------------------------------------------------------===// // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, + bool ZT0IsUndef = false) { auto &Ctx = M->getContext(); auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible"); + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE; FunctionCallee Callee = - M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); + M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark @@ -67,8 +78,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { if (ZT0IsUndef) Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef")); - Call->setCallingConv( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); + Call->setCallingConv(TLI.getLibcallCallingConv(LC)); // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = @@ -98,7 +108,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { /// interface if it does not share ZA or ZT0. /// bool SMEABI::updateNewStateFunctions(Module *M, Function *F, - IRBuilder<> &Builder, SMEAttrs FnAttrs) { + IRBuilder<> &Builder, SMEAttrs FnAttrs, + const TargetLowering &TLI) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); Builder.SetInsertPoint(&OrigBB->front()); @@ -124,7 +135,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Create a call __arm_tpidr2_save, which commits the lazy save. 
Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); + emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); @@ -172,10 +183,14 @@ bool SMEABI::runOnFunction(Function &F) { if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; + const TargetMachine &TM = + getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering(); + bool Changed = false; SMEAttrs FnAttrs(F); if (FnAttrs.isNewZA() || FnAttrs.isNewZT0()) - Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI); return Changed; } diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index bd28716..564af67 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -80,16 +80,10 @@ static bool isMatchingStartStopPair(const MachineInstr *MI1, if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask()) return false; - // This optimisation is unlikely to happen in practice for conditional - // smstart/smstop pairs as the virtual registers for pstate.sm will always - // be different. - // TODO: For this optimisation to apply to conditional smstart/smstop, - // this pass will need to do more work to remove redundant calls to - // __arm_sme_state. - // Only consider conditional start/stop pairs which read the same register - // holding the original value of pstate.sm, as some conditional start/stops - // require the state on entry to the function. + // holding the original value of pstate.sm. This is somewhat over conservative + // as all conditional streaming mode changes only look at the state on entry + // to the function. 
if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) { Register Reg1 = MI1->getOperand(3).getReg(); Register Reg2 = MI2->getOperand(3).getReg(); diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a0320f9..a3a7d0f 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -809,6 +809,11 @@ let hasNoSchedulingInfo = 1 in { Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> { let FalseLanes = flags; } + + class UnpredRegImmPseudo<ZPRRegOp zprty, Operand immty> + : SVEPseudo2Instr<NAME, 0>, + Pseudo<(outs zprty:$Zd), (ins zprty:$Zs, immty:$imm), []> { + } } // @@ -1885,13 +1890,14 @@ class sve_int_perm_extract_i<string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = DestructiveOther; + let DestructiveInstType = Destructive2xRegImmUnpred; let ElementSize = ElementSizeNone; let hasSideEffects = 0; } -multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op> { - def NAME : sve_int_perm_extract_i<asm>; +multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op, string Ps> { + def NAME : sve_int_perm_extract_i<asm>, + SVEPseudo2Instr<Ps, 1>; def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, imm0_255, !cast<Instruction>(NAME)>; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 271094f..dd6fa16 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -7,17 +7,14 @@ //===----------------------------------------------------------------------===// #include "AArch64SMEAttributes.h" +#include "AArch64ISelLowering.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/RuntimeLibcalls.h" #include <cassert> using namespace llvm; -void SMEAttrs::set(unsigned M, bool Enable) { - if (Enable) - Bitmask |= M; - else - Bitmask &= ~M; - +void SMEAttrs::validate() const { // Streaming Mode Attrs assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); @@ -77,19 +74,36 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= encodeZT0State(StateValue::New); } -void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) { +void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI) { + RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); + if (Impl == RTLIB::Unsupported) + return; unsigned KnownAttrs = SMEAttrs::Normal; - if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") - KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); - if (FuncName == "__arm_tpidr2_restore") + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + switch (LC) { + case RTLIB::SMEABI_SME_STATE: + case RTLIB::SMEABI_TPIDR2_SAVE: + case RTLIB::SMEABI_GET_CURRENT_VG: + case RTLIB::SMEABI_SME_STATE_SIZE: + case RTLIB::SMEABI_SME_SAVE: + case RTLIB::SMEABI_SME_RESTORE: + KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + case RTLIB::SMEABI_ZA_DISABLE: + case RTLIB::SMEABI_TPIDR2_RESTORE: KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | SMEAttrs::SME_ABI_Routine; - if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || - FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") + break; + case RTLIB::SC_MEMCPY: + case RTLIB::SC_MEMMOVE: + 
case RTLIB::SC_MEMSET: + case RTLIB::SC_MEMCHR: KnownAttrs |= SMEAttrs::SM_Compatible; - if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || - FuncName == "__arm_sme_state_size") - KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + default: + break; + } set(KnownAttrs); } @@ -110,11 +124,11 @@ bool SMECallAttrs::requiresSMChange() const { return true; } -SMECallAttrs::SMECallAttrs(const CallBase &CB) +SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { if (auto *CalledFunction = CB.getCalledFunction()) - CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes); + CalledFn = SMEAttrs(*CalledFunction, TLI); // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index f1be0ecb..d26e3cd 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -13,6 +13,8 @@ namespace llvm { +class AArch64TargetLowering; + class Function; class CallBase; class AttributeList; @@ -48,19 +50,27 @@ public: CallSiteFlags_Mask = ZT0_Undef }; - enum class InferAttrsFromName { No, Yes }; - SMEAttrs() = default; SMEAttrs(unsigned Mask) { set(Mask); } - SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No) + SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) : SMEAttrs(F.getAttributes()) { - if (Infer == InferAttrsFromName::Yes) - addKnownFunctionAttrs(F.getName()); + if (TLI) + addKnownFunctionAttrs(F.getName(), *TLI); } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); }; + SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { + addKnownFunctionAttrs(FuncName, TLI); + }; - void set(unsigned M, bool Enable = true); + void set(unsigned M, bool Enable = true) { + if (Enable) + Bitmask |= M; + else + Bitmask &= ~M; +#ifndef NDEBUG + validate(); +#endif + } // Interfaces to query PSTATE.SM bool hasStreamingBody() const { return Bitmask & SM_Body; } @@ -146,7 +156,9 @@ public: } private: - void addKnownFunctionAttrs(StringRef FuncName); + void addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI); + void validate() const; }; /// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has @@ -163,7 +175,7 @@ public: SMEAttrs Callsite = SMEAttrs::Normal) : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} - SMECallAttrs(const CallBase &CB); + SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); SMEAttrs &caller() { return CallerFn; } SMEAttrs &callee() { return IsIndirect ? 
Callsite : CalledFn; } @@ -194,7 +206,7 @@ public: } bool requiresEnablingZAAfterCall() const { - return requiresLazySave() || requiresDisablingZABeforeCall(); + return requiresDisablingZABeforeCall(); } bool requiresPreservingAllZAState() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 007b481..0059a86 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f266398..8e4b636 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", "gfx12", - [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128, + [FeatureFP64, FeatureMIMG_R128, FeatureFlatAddressSpace, Feature16BitInsts, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, @@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot7Insts, @@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureCUStores, + FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..69722bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -41,6 +41,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" @@ -719,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier, + OutContext, IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext, @@ -733,6 +736,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutContext, IsLocal)); } + // Emit _dvgpr$ symbol when appropriate. 
+ emitDVgprSymbol(MF); + if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); @@ -803,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " AccumOffset: " + getMCExprStr(AdjustedAccum), false); } + if (AMDGPU::isGFX1250(STM)) { + const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx); + const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo( + CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx); + const MCExpr *BarBlks = + MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx); + OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks), + false); + } + OutStreamer->emitRawComment( " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false); @@ -875,6 +891,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +// When appropriate, add a _dvgpr$ symbol, with the value of the function +// symbol, plus an offset encoding one less than the number of VGPR blocks used +// by the function in bits 5..3 of the symbol value. A "VGPR block" can be +// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is +// used by a front-end to have functions that are chained rather than called, +// and a dispatcher that dynamically resizes the VGPR count before dispatching +// to a function. +void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.isDynamicVGPREnabled() && + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) { + MCContext &Ctx = MF.getContext(); + unsigned BlockSize = MFI.getDynamicVGPRBlockSize(); + MCValue NumVGPRs; + if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable( + NumVGPRs, nullptr) || + !NumVGPRs.isAbsolute()) { + llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol"); + } + // Calculate number of VGPR blocks. + // Treat 0 VGPRs as 1 VGPR to avoid underflowing. + unsigned NumBlocks = + divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize); + + if (NumBlocks > 8) { + OutContext.reportError({}, + "too many DVGPR blocks for _dvgpr$ symbol for '" + + Twine(CurrentFnSym->getName()) + "'"); + return; + } + unsigned EncodedNumBlocks = (NumBlocks - 1) << 3; + // Add to function symbol to create _dvgpr$ symbol. + const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd( + MCSymbolRefExpr::create(CurrentFnSym, Ctx), + MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx); + MCSymbol *DVgprFuncSym = + Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName()); + OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal); + emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility()); + emitLinkage(&MF.getFunction(), DVgprFuncSym); + } +} + // TODO: Fold this into emitFunctionBodyStart. 
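As an aside on the encoding that emitDVgprSymbol computes above: the same arithmetic can be written as a minimal standalone C++ sketch. This is an illustration only, not code from the patch; the helper name encodeDVgprOffset and the example values are hypothetical, and it assumes the dynamic-VGPR block size is 16 or 32 as the comment describes.

#include <algorithm>
#include <cassert>

// One less than the number of VGPR blocks goes into bits 5..3 of the _dvgpr$
// symbol offset. Zero VGPRs count as one block to avoid underflow, and at
// most 8 blocks (encoded value 7) fit in the field.
unsigned encodeDVgprOffset(unsigned NumVGPRs, unsigned BlockSize) {
  unsigned NumBlocks = (std::max(NumVGPRs, 1u) + BlockSize - 1) / BlockSize;
  assert(NumBlocks >= 1 && NumBlocks <= 8 && "too many DVGPR blocks");
  return (NumBlocks - 1) << 3;
}
// Example: 96 VGPRs with 16-VGPR blocks gives 6 blocks, so the offset is 0x28.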
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', @@ -964,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DynamicCallStack = MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack), GetSymRefExpr(RIK::RIK_HasRecursion), Ctx); + ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -997,89 +1057,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. - // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. - const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). 
- if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. - const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum @@ -1168,7 +1163,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { + // LDS is allocated in 256 dword blocks. + LDSAlignShift = 10; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize163840)) { // LDS is allocated in 320 dword blocks. LDSAlignShift = 11; } else if (STM.getFeatureBits().test( @@ -1205,8 +1204,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, CreateExpr(STM.getWavefrontSize()), Ctx), CreateExpr(1ULL << ScratchAlignShift)); - if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + if (STM.supportsWGP()) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + } + + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.MemOrdered = 1; ProgInfo.FwdProgress = 1; } @@ -1264,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } + if (AMDGPU::isGFX1250(STM)) + ProgInfo.ComputePGMRSrc3 = + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); + ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 63589d2..9e854fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -54,6 +54,9 @@ private: MCCodeEmitter *DumpCodeInstEmitter = nullptr; + // When appropriate, add a _dvgpr$ symbol. 
+ void emitDVgprSymbol(MachineFunction &MF); + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out, const SIProgramInfo &KernelInfo, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..d1a5b4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; @@ -1464,9 +1476,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { if (Function *F = Info.CB->getCalledFunction()) if (F->isIntrinsic()) { - assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain && - "Unexpected intrinsic"); - return lowerChainCall(MIRBuilder, Info); + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_cs_chain: + return lowerChainCall(MIRBuilder, Info); + case Intrinsic::amdgcn_call_whole_wave: + Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave; + + // Get the callee from the original instruction, so it doesn't look like + // this is an indirect call. 
+ Info.Callee = MachineOperand::CreateGA( + cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0); + Info.OrigArgs.erase(Info.OrigArgs.begin()); + Info.IsVarArg = false; + break; + default: + llvm_unreachable("Unexpected intrinsic call"); + } } if (Info.IsVarArg) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index 74d1fae..d14b5ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature< def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>; +def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>; class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9d6584a..04c4d00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { return false; } +static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src, + llvm::SelectionDAG *CurDAG, + const GCNSubtarget *Subtarget) { + if (!Subtarget->useRealTrue16Insts()) { + return Lo; + } + + SDValue NewSrc; + SDLoc SL(Lo); + + if (Lo->isDivergent()) { + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SL, Lo.getValueType()), + 0); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo, + CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef, + CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)}; + + NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), + 0); + } else { + // The S_MOV is needed since the Lo could still be a VGPR16. + // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on + // the fixvgpr2sgprcopy pass to legalize it. + NewSrc = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo), + 0); + } + + return NewSrc; +} + // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { @@ -1162,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + SDVTList VTList; unsigned Opc; - if (Subtarget->hasMADIntraFwdBug()) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; - else - Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + if (Subtarget->hasMadU64U32NoCarry()) { + VTList = CurDAG->getVTList(MVT::i64); + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; + } else { + VTList = CurDAG->getVTList(MVT::i64, MVT::i1); + if (Subtarget->hasMADIntraFwdBug()) { + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + } else { + Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + } + } SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; - SDNode *Mad = CurDAG->getMachineNode( - Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops); + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops); if (!SDValue(N, 0).use_empty()) { SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, @@ -3412,8 +3453,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, // Really a scalar input. Just select from the low half of the register to // avoid packing. - if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + if (VecSize == Lo.getValueSizeInBits()) { Src = Lo; + } else if (VecSize == 32) { + Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget); } else { assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 64e68ab..8ccd8fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); if (!MFI->isModuleEntryFunction()) { + auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV)); if (std::optional<uint32_t> Address = AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { + if (IsNamedBarrier) { + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + MFI->recordNumNamedBarriers(Address.value(), BarCnt); + } return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); + } else if (IsNamedBarrier) { + llvm_unreachable("named barrier should have an assigned address"); } } @@ -1802,16 +1809,36 @@ std::pair<SDValue, SDValue> AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT, SelectionDAG &DAG) const { + EVT VT = N.getValueType(); assert(LoVT.getVectorNumElements() + (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= - N.getValueType().getVectorNumElements() && + VT.getVectorNumElements() && "More vector elements requested than available!"); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, DAG.getVectorIdxConstant(0, DL)); - SDValue Hi = DAG.getNode( - HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, - HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); - return std::pair(Lo, Hi); + + unsigned LoNumElts = LoVT.getVectorNumElements(); + + if (HiVT.isVector()) { + unsigned HiNumElts = HiVT.getVectorNumElements(); + if ((VT.getVectorNumElements() % HiNumElts) == 0) { + // Avoid creating an extract_subvector with an index that isn't a multiple + // of the result type. 
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, + DAG.getConstant(LoNumElts, DL, MVT::i32)); + return {Lo, Hi}; + } + + SmallVector<SDValue, 8> Elts; + DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts, + /*Count=*/HiNumElts); + SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts); + return {Lo, Hi}; + } + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N, + DAG.getVectorIdxConstant(LoNumElts, DL)); + return {Lo, Hi}; } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, @@ -4002,7 +4029,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: - case Intrinsic::amdgcn_tanh: { + case Intrinsic::amdgcn_tanh: + case Intrinsic::amdgcn_prng_b32: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? Src : SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7fd131..5d31eed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); case Intrinsic::amdgcn_s_get_barrier_state: @@ -5521,11 +5523,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = + bool IsInBounds; + std::tie(PtrBase, ConstOffset, IsInBounds) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && - !isFlatScratchBaseLegal(Root.getReg()))) + // Adding the offset to the base address with an immediate in a FLAT + // instruction must not change the memory aperture in which the address falls. + // Therefore we can only fold offsets from inbounds GEPs into FLAT + // instructions. + if (ConstOffset == 0 || + (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg())) || + (FlatVariant == SIInstrFlags::FLAT && !IsInBounds)) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -5577,7 +5586,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { if (NeedIOffset && @@ -5760,7 +5770,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. 
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, @@ -5836,7 +5847,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); Register OrigAddr = Addr; if (ConstOffset != 0 && @@ -5942,7 +5954,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(VAddr, *MRI); if (ConstOffset != 0) { if (TII.isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || @@ -6181,8 +6194,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { if (isDSOffsetLegal(PtrBase, Offset)) { @@ -6243,8 +6256,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { int64_t OffsetValue0 = Offset; @@ -6265,22 +6278,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, } /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return -/// the base value with the constant offset. There may be intervening copies -/// between \p Root and the identified constant. Returns \p Root, 0 if this does -/// not match the pattern. -std::pair<Register, int64_t> +/// the base value with the constant offset, and if the offset computation is +/// known to be inbounds. There may be intervening copies between \p Root and +/// the identified constant. Returns \p Root, 0, false if this does not match +/// the pattern. 
+std::tuple<Register, int64_t, bool> AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( - Register Root, const MachineRegisterInfo &MRI) const { + Register Root, const MachineRegisterInfo &MRI) const { MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) - return {Root, 0}; + return {Root, 0, false}; MachineOperand &RHS = RootI->getOperand(2); std::optional<ValueAndVReg> MaybeOffset = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!MaybeOffset) - return {Root, 0}; - return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; + return {Root, 0, false}; + bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds); + return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(), + IsInBounds}; } static void addZeroImm(MachineInstrBuilder &MIB) { @@ -6358,7 +6374,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Src, *MRI); if (isUInt<32>(Offset)) { Data.N0 = PtrBase; Data.Offset = Offset; @@ -6757,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6764,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6814,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4); constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; MachineInstrBuilder MIB; - MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0)); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); I.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c9da419..0924396 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ private: bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierLeave(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, @@ -295,7 +296,7 @@ private: InstructionSelector::ComplexRendererFns selectDSReadWrite2(MachineOperand &Root, unsigned size) const; - std::pair<Register, int64_t> + std::tuple<Register, int64_t, bool> getPtrBaseWithConstantOffset(Register Root, const MachineRegisterInfo &MRI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 523c66c..56113e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -545,7 +545,8 @@ public: AU.addRequired<TargetPassConfig>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<UniformityInfoWrapperPass>(); - AU.setPreservesAll(); + // Invalidates UniformityInfo + AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 40d960e..600a130 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" @@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +// Retrieves the scalar type that's the same size as the mem desc +static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return std::make_pair(TypeIdx, LLT::scalar(MemSize)); + }; +} + // Increase the number of vector elements to reach the next legal RegClass. static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { }; } +// If we have a truncating store or an extending load with a data size larger +// than 32-bits and mem location is a power of 2 +static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return isWideScalarExtLoadTruncStore(TypeIdx)(Query) && + isPowerOf2_64(MemSize); + }; +} + // TODO: Should load to s16 be legal? 
Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. @@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // May need relegalization for the scalars. return std::pair(0, EltTy); }) - .minScalar(0, S32) - .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) - .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) - .lower(); + .minScalar(0, S32) + .narrowScalarIf(isTruncStoreToSizePowerOf2(0), + getScalarTypeFromMemDesc(0)) + .widenScalarToNextPow2(0) + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } // FIXME: Unaligned accesses not lowered. @@ -5653,7 +5673,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && ST.hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) + AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm())) SplitSize = 64; if (Size == SplitSize) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 304e91e..139cad6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -599,8 +599,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) { IRB.SetInsertPoint(&SI); Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); - for (auto *Dbg : at::getAssignmentMarkers(&SI)) - Dbg->setValue(IntV); + for (auto *Dbg : at::getDVRAssignmentMarkers(&SI)) + Dbg->setRawLocation(ValueAsMetadata::get(IntV)); SI.setOperand(0, IntV); return true; @@ -1361,6 +1361,7 @@ public: PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); + PtrParts visitPtrToAddrInst(PtrToAddrInst &PA); PtrParts visitPtrToIntInst(PtrToIntInst &PI); PtrParts visitIntToPtrInst(IntToPtrInst &IP); PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -1954,6 +1955,21 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { return {nullptr, nullptr}; } +PtrParts SplitPtrStructs::visitPtrToAddrInst(PtrToAddrInst &PA) { + Value *Ptr = PA.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&PA); + + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *Res = IRB.CreateIntCast(Off, PA.getType(), /*isSigned=*/false); + copyMetadata(Res, &PA); + Res->takeName(&PA); + SplitUsers.insert(&PA); + PA.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { if (!isSplitFatPtr(IP.getType())) return {nullptr, nullptr}; @@ -2350,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; for (const BasicBlock &BB : F) - for (const Instruction &I : BB) + for (const Instruction &I : BB) { HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); + // Catch null pointer constants in loads, stores, etc. 
+ for (const Value *V : I.operand_values()) + HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType())); + } return HasFatPointers; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp index 6390853..6b3cdf5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp @@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK, return GOCS(".num_agpr"); case RIK_NumSGPR: return GOCS(".numbered_sgpr"); + case RIK_NumNamedBarrier: + return GOCS(".num_named_barrier"); case RIK_PrivateSegSize: return GOCS(".private_seg_size"); case RIK_UsesVCC: @@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext); MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); + MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext); auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) { const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext); @@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { assignMaxRegSym(MaxVGPRSym, MaxVGPR); assignMaxRegSym(MaxAGPRSym, MaxAGPR); assignMaxRegSym(MaxSGPRSym, MaxSGPR); + assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier); } void MCResourceInfo::reset() { *this = MCResourceInfo(); } @@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) { return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr"); } +MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) { + return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier"); +} + // Tries to flatten recursive call register resource gathering. Simple cycle // avoiding dfs to find the constants in the propagated symbols. 
// Assumes: @@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr( case RIK_NumAGPR: ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext)); break; + case RIK_NumNamedBarrier: + ArgExprs.push_back(MCSymbolRefExpr::create( + getMaxNamedBarrierSymbol(OutContext), OutContext)); + break; } } } @@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo( MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); bool IsLocal = MF.getFunction().hasLocalLinkage(); + MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext); if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) { addMaxVGPRCandidate(FRI.NumVGPR); addMaxAGPRCandidate(FRI.NumAGPR); addMaxSGPRCandidate(FRI.NumExplicitSGPR); + addMaxNamedBarrierCandidate(FRI.NumNamedBarrier); } const TargetMachine &TM = MF.getTarget(); @@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo( SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR); SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR); SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR); + SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier); { // The expression for private segment size should be: FRI.PrivateSegmentSize diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h index 297e93b..b605516 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h @@ -31,6 +31,7 @@ public: RIK_NumVGPR, RIK_NumAGPR, RIK_NumSGPR, + RIK_NumNamedBarrier, RIK_PrivateSegSize, RIK_UsesVCC, RIK_UsesFlatScratch, @@ -43,6 +44,7 @@ private: int32_t MaxVGPR = 0; int32_t MaxAGPR = 0; int32_t MaxSGPR = 0; + int32_t MaxNamedBarrier = 0; // Whether the MCResourceInfo has been finalized through finalize(MCContext // &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR @@ -75,6 +77,9 @@ public: void addMaxSGPRCandidate(int32_t candidate) { MaxSGPR = std::max(MaxSGPR, candidate); } + void addMaxNamedBarrierCandidate(int32_t candidate) { + MaxNamedBarrier = std::max(MaxNamedBarrier, candidate); + } MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext, bool IsLocal); @@ -90,6 +95,7 @@ public: MCSymbol *getMaxVGPRSymbol(MCContext &OutContext); MCSymbol *getMaxAGPRSymbol(MCContext &OutContext); MCSymbol *getMaxSGPRSymbol(MCContext &OutContext); + MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext); /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function /// granularity. 
However, some resource info has to be assigned the call diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0c82cace..664a15c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, if (!BarAddr) llvm_unreachable("named barrier should have an assigned address"); Entry.first->second = BarAddr.value(); + unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16; + recordNumNamedBarriers(BarAddr.value(), BarCnt); return BarAddr.value(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index b1022e4..fc64e16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -49,6 +49,8 @@ protected: // Flag to check dynamic LDS usage by kernel. bool UsesDynamicLDS = false; + uint32_t NumNamedBarriers = 0; + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; @@ -86,6 +88,12 @@ public: return GDSSize; } + void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) { + NumNamedBarriers = + std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1); + } + uint32_t getNumNamedBarriers() const { return NumNamedBarriers; } + bool isEntryFunction() const { return IsEntryFunction; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index aa72c3e..dfe7c53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_var: case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: @@ -381,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA) { MemorySSAWalker *Walker = MSSA->getWalker(); SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)}; - SmallSet<MemoryAccess *, 8> Visited; + SmallPtrSet<MemoryAccess *, 8> Visited; MemoryLocation Loc(MemoryLocation::get(Load)); LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index b6c6d92..6ddfa38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 3a37518..28d5400 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -134,8 +134,8 @@ static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); - SmallSet<const Value *, 32> WorkSet; - SmallSet<const Value *, 32> Visited; + SmallPtrSet<const Value *, 32> WorkSet; + SmallPtrSet<const Value *, 32> Visited; if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp index 4009451..90c4f4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp @@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF) TRI(*ST.getRegisterInfo()) {} bool AMDGPUPreloadKernArgProlog::run() { - if (!ST.hasKernargPreload()) + if (!ST.needsKernArgPreloadProlog()) return false; unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index 984c1ee..a386fe6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -37,6 +37,11 @@ static cl::opt<unsigned> KernargPreloadCount( "amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); +static cl::opt<bool> + EnableKernargPreload("amdgpu-kernarg-preload", + cl::desc("Enable preload kernel arguments to SGPRs"), + cl::init(true)); + namespace { class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { @@ -275,6 +280,9 @@ AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( : ModulePass(ID), TM(TM) {} static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { + if (!EnableKernargPreload) + return false; + SmallVector<Function *, 4> FunctionsToErase; bool Changed = false; for (auto &F : M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5a6ad40..8c56c21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); addRulesForGOpcs({G_PTR_ADD}) - .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) - .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) - .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); + .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 868b1a2..2379296 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 
1); return; + case Intrinsic::amdgcn_s_barrier_join: + constrainOpWithReadfirstlane(B, MI, 1); + return; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: constrainOpWithReadfirstlane(B, MI, 1); constrainOpWithReadfirstlane(B, MI, 2); @@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_sleep_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_barrier_join: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index e2e5c57..d2ec7dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { // Delete FeatureWavefrontSize32 functions for // gfx9 and below targets that don't support the mode. - // gfx10+ is implied to support both wave32 and 64 features. + // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features. // They are not in the feature set. So, we need a separate check - if (ST->getGeneration() < AMDGPUSubtarget::GFX10 && - ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { + if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32); return true; } + // gfx125x only support FeatureWavefrontSize32. + if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) { + reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64); + return true; + } return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 8101c68..0ea9add 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( MRI.isLiveIn(MFI->getPreloadedReg( AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)); + Info.NumNamedBarrier = MFI->getNumNamedBarriers(); + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat // instructions aren't used to access the scratch buffer. Inline assembly may // need it though. 
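For readers tracking the named-barrier bookkeeping: the NumNamedBarrier value gathered by the resource-usage analysis above comes from recordNumNamedBarriers in the AMDGPUMachineFunction.h hunk shown earlier. A small standalone C++ sketch of that arithmetic follows; it is not code from the patch, and the helper name maxNamedBarriers and the example numbers are invented for illustration.

#include <algorithm>
#include <cstdint>

// Same arithmetic as recordNumNamedBarriers: take bits 8..4 of the variable's
// assigned LDS address and add one less than its barrier count, where every
// 16 bytes of the variable is one barrier.
uint32_t maxNamedBarriers(uint32_t Current, uint32_t GVAddr, unsigned BarCnt) {
  return std::max<uint32_t>(Current, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
}
// Example: a 32-byte barrier variable (BarCnt = 2) at address 0x30 gives
// maxNamedBarriers(0, 0x30, 2) == 4.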
@@ -241,6 +243,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( if (!RC || !TRI.isVGPRClass(RC)) continue; + if (MI.isCall() || MI.isMetaInstruction()) + continue; + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h index acfff96..9ae3bb3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -35,6 +35,7 @@ public: int32_t NumVGPR = 0; int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; + int32_t NumNamedBarrier = 0; uint64_t CalleeSegmentSize = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index f580f43..20b5fd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -57,27 +57,47 @@ public: TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), LIS(LIS) {} + // TODO: Remove this restriction + bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const { + const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst); + return Src2->getReg() == Dst->getReg() && + Src2->getSubReg() == Dst->getSubReg(); + } + + bool isRewriteCandidate(const MachineInstr &MI) const { + return TII.isMAI(MI) && + AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 && + mfmaHasSameSrc2AndDstReg(MI); + } + + /// Compute the register class constraints based on the uses of \p Reg, - /// excluding uses from \p ExceptMI. This should be nearly identical to + /// excluding MFMA uses which can be rewritten to change the register + /// class constraint. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. const TargetRegisterClass * - recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, - const MachineInstr *ExceptMI) const; + recomputeRegClassExceptRewritable(Register Reg, + const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC) const; bool run(MachineFunction &MF) const; }; const TargetRegisterClass * -AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept( +AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const { + const TargetRegisterClass *NewRC) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { // Apply the effect of the given operand to NewRC. MachineInstr *MI = MO.getParent(); - if (MI == ExceptMI) + + // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the + // effects of rewrite candidates. It just so happens that we can use either + // AGPR or VGPR in src0/src1, so don't bother checking the constraint + // effects of the individual operands. + if (isRewriteCandidate(*MI)) continue; unsigned OpNo = &MO - &MI->getOperand(0); @@ -96,8 +116,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { return false; // Early exit if no AGPRs were assigned. 
- if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) + if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) { + LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n"); return false; + } bool MadeChange = false; @@ -109,17 +131,25 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // Find AV_* registers assigned to AGPRs. const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.isVectorSuperClass(VirtRegRC)) + if (!TRI.hasAGPRs(VirtRegRC)) continue; - const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + const TargetRegisterClass *AssignedRC = VirtRegRC; + if (TRI.hasVGPRs(VirtRegRC)) { + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + } LiveInterval &LI = LIS.getInterval(VReg); // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { + if (VNI->isPHIDef() || VNI->isUnused()) + continue; + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); // TODO: Handle SplitKit produced copy bundles for partially defined @@ -183,10 +213,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. - const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept( - Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI); - if (!Src2ExceptRC) + const TargetRegisterClass *Src2ExceptRC = + recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, + VirtRegRC); + if (!Src2ExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); continue; + } const TargetRegisterClass *NewSrc2ConstraintRC = TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); @@ -196,8 +229,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { const TargetRegisterClass *NewSrc2RC = TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); if (!NewSrc2RC) { - // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA - // using a rewritable MFMA can be rewritten as a pair. LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) << " are incompatible with replacement class\n"); continue; @@ -208,8 +239,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { CopySrcMI->setDesc(TII.get(AGPROp)); - // TODO: Is replacing too aggressive, fixup these instructions only? - MRI.replaceRegWith(CopySrcReg, VReg); + // Perform replacement of the register, rewriting the rewritable uses. + for (MachineInstr &UseMI : + make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { + if (TII.isMAI(UseMI)) { + // Note the register we need to rewrite may still appear in src0/src1, + // but that's fine since those can use A or V anyway. 
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode()); + if (ReplacementOp != -1) + UseMI.setDesc(TII.get(ReplacementOp)); + } + + UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + } LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 10b8606..7be1899 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -378,6 +378,7 @@ foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_dead>; +def : SourceOfDivergence<int_amdgcn_call_whole_wave>; class AlwaysUniform<Intrinsic intr> { Intrinsic Intr = intr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp index b60ded3..56aa3f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) { // Lower the priority on edges where control leaves blocks from which // the VMEM loads are reachable. - SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; + SmallPtrSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { if (MBB.succ_empty()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c1f1703..e393aa19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (Level == OptimizationLevel::O0) return; - PM.addPass(AMDGPUUnifyMetadataPass()); - // We don't want to run internalization at per-module stage. if (InternalizeSymbols && !isLTOPreLink(Phase)) { PM.addPass(InternalizePass(mustPreserveGV)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp deleted file mode 100644 index e400491..0000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ /dev/null @@ -1,119 +0,0 @@ -//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This pass that unifies multiple OpenCL metadata due to linking. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" - -using namespace llvm; - -namespace { - - namespace kOCLMD { - - const char SpirVer[] = "opencl.spir.version"; - const char OCLVer[] = "opencl.ocl.version"; - const char UsedExt[] = "opencl.used.extensions"; - const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; - const char CompilerOptions[] = "opencl.compiler.options"; - const char LLVMIdent[] = "llvm.ident"; - - } // end namespace kOCLMD - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a pair of - /// integer constant, e.g. 
- /// !Name = {!n1, !n2} - /// !n1 = {i32 1, i32 2} - /// !n2 = {i32 2, i32 0} - /// Keep the largest version as the sole operand if PickFirst is false. - /// Otherwise pick it from the first value, representing kernel module. - bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() <= 1) - return false; - MDNode *MaxMD = nullptr; - auto MaxVer = 0U; - for (auto *VersionMD : NamedMD->operands()) { - assert(VersionMD->getNumOperands() == 2); - auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0)); - auto VersionMajor = CMajor->getZExtValue(); - auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1)); - auto VersionMinor = CMinor->getZExtValue(); - auto Ver = (VersionMajor * 100) + (VersionMinor * 10); - if (Ver > MaxVer) { - MaxVer = Ver; - MaxMD = VersionMD; - } - if (PickFirst) - break; - } - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - NamedMD->addOperand(MaxMD); - return true; - } - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a list e.g. - /// !Name = {!n1, !n2} - /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}} - /// !n2 = !{!"cl_khr_image"} - /// Combine it into a single list with unique operands. - bool unifyExtensionMD(Module &M, StringRef Name) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() == 1) - return false; - - SmallVector<Metadata *, 4> All; - for (auto *MD : NamedMD->operands()) - for (const auto &Op : MD->operands()) - if (!llvm::is_contained(All, Op.get())) - All.push_back(Op.get()); - - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - for (const auto &MD : All) - NamedMD->addOperand(MDNode::get(M.getContext(), MD)); - - return true; - } - - /// Unify multiple OpenCL metadata due to linking. - bool unifyMetadataImpl(Module &M) { - const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer}; - const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat, - kOCLMD::CompilerOptions, kOCLMD::LLVMIdent}; - - bool Changed = false; - - for (auto &I : Vers) - Changed |= unifyVersionMD(M, I, true); - - for (auto &I : Exts) - Changed |= unifyExtensionMD(M, I); - - return Changed; - } - - } // end anonymous namespace - - PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M, - ModuleAnalysisManager &AM) { - return unifyMetadataImpl(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); - } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0d2feeb..9514732 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, if (DppCtrlIdx >= 0) { unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc))) { - // DP ALU DPP is supported for row_newbcast only on GFX9* + if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share + // only on GFX12. SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "DP ALU dpp only supports row_newbcast"); + Error(S, isGFX12() ? 
"DP ALU dpp only supports row_share" + : "DP ALU dpp only supports row_newbcast"); return false; } } @@ -6268,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ExprVal, ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { - if (IVersion.Major < 10) - return Error(IDRange.Start, "directive requires gfx10+", IDRange); + if (!supportsWGP(getSTI())) + return Error(IDRange.Start, + "directive unsupported on " + getSTI().getCPU(), IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal, ValRange); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c466f9c..dc9dd22 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp - AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp GCNCreateVOPD.cpp GCNDPPCombine.cpp diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fb7d634..070de00 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( "must be zero on gfx10 or gfx11"); } - // Bits [14-30]. - CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4, + // Bits [14-16] + if (isGFX1250()) { + PRINT_DIRECTIVE(".amdhsa_named_barrier_count", + COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT); + } else { + CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4, + "COMPUTE_PGM_RSRC3", + "must be zero on gfx10+"); + } + + // Bits [17-30]. + CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5, "COMPUTE_PGM_RSRC3", "must be zero on gfx10+"); // Bits [31]. 
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index d5d1074..f5d4384 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1274,7 +1274,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; } -let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in { +let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>; } diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index f9a907a..184929a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) { DPPInst.addImm(ByteSelOpr->getImm()); } + if (MachineOperand *BitOp3 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) { + assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3)); + DPPInst.add(*BitOp3); + } } DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); @@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || - MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { - auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); - assert(DppCtrl && DppCtrl->isImm()); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + unsigned DppCtrlVal = DppCtrl->getImm(); + if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) { + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n"); + // Split it. + return false; + } + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) { LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" " control value\n"); // Let it split, then control may become legal. 
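The two checks added above gate 64-bit DPP moves in two steps: the subtarget must support DP ALU DPP at all (FeatureDPALU_DPP), and the specific dpp_ctrl value must be legal for DP ALU use on that subtarget; otherwise the move is left to be split into 32-bit halves. A condensed sketch of that gate, reusing the same queries as the hunk (the wrapper function itself is hypothetical and assumes the AMDGPU backend headers):

// Hypothetical condensation of the checks above; not a helper that exists in
// GCNDPPCombine. Combining a 64-bit DPP mov is only attempted when both
// conditions hold; otherwise the pseudo is split.
static bool canCombine64BitDPPMov(const GCNSubtarget &ST, unsigned DppCtrlVal) {
  if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP))
    return false; // no DP ALU DPP support at all
  return AMDGPU::isLegalDPALU_DPPControl(ST, DppCtrlVal);
}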
@@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) && + AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: DPP ALU DPP is not supported\n"); + break; + } + + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: not valid 64-bit DPP control value\n"); + break; + } + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 96cb5ae..a3b64ae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1200,6 +1200,14 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); + if (ST.requiresWaitIdleBeforeGetReg()) + fixGetRegWaitIdle(MI); + if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug()) + fixDsAtomicAsyncBarrierArriveB64(MI); + if (ST.hasScratchBaseForwardingHazard()) + fixScratchBaseForwardingHazard(MI); + if (ST.setRegModeNeedsVNOPs()) + fixSetRegMode(MI); } static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, @@ -1350,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { return (Decoded.DsCnt == 0); } default: + assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) || + MI.getOpcode() == AMDGPU::S_WAIT_IDLE) && + "unexpected wait count instruction"); // SOPP instructions cannot mitigate the hazard. if (TII->isSOPP(MI)) return false; @@ -1731,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0x0fff); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); return true; } @@ -1781,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0x0fff)) + AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) return HazardExpired; // Track registers writes @@ -2239,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) return true; - switch (MI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAIT_IDLE: - return true; - default: - break; - } - - return false; + return SIInstrInfo::isWaitcnt(MI.getOpcode()); }; return FPAtomicToDenormModeWaitStates - @@ -3428,3 +3427,125 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { return true; } + +bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) { + if (!isSGetReg(MI->getOpcode())) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + switch (getHWReg(TII, *MI)) { + default: + return false; + case AMDGPU::Hwreg::ID_STATUS: + case AMDGPU::Hwreg::ID_STATE_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_USER: + break; + } + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0); + return true; 
+} + +bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + + return true; +} + +bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { + // No reason to check this in pre-RA scheduling, SGPRs have to be allocated + // for hazard to trigger. + if (!IsHazardRecognizerMode) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU. + const int FlatScrBaseWaitStates = 10; + + bool ReadsFlatScrLo = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI); + bool ReadsFlatScrHi = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI); + if (isSGetReg(MI->getOpcode())) { + switch (getHWReg(TII, *MI)) { + default: + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_LO: + ReadsFlatScrLo = true; + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_HI: + ReadsFlatScrHi = true; + break; + } + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IsRegDefHazard = [&](Register Reg) -> bool { + DenseSet<const MachineBasicBlock *> Visited; + auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) { + return MI.modifiesRegister(Reg, TRI); + }; + + // This literally abuses the idea of waitstates. Instead of waitstates it + // returns 1 for SGPR written and 0 otherwise. + auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned { + if (!TII->isSALU(MI) && !TII->isVALU(MI)) + return 0; + for (const MachineOperand &MO : MI.all_defs()) { + if (TRI->isSGPRReg(MRI, MO.getReg())) + return 1; + } + return 0; + }; + + auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) { + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned Wait = MI.getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 && + AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0) + return true; + } + return SgprWrites >= FlatScrBaseWaitStates; + }; + + return ::getWaitStatesSince( + IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()), + 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates; + }; + + if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) || + !IsRegDefHazard(AMDGPU::SGPR102)) && + (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) || + !IsRegDefHazard(AMDGPU::SGPR103))) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( + AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + return true; +} + +bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) { + if (!isSSetReg(MI->getOpcode()) || + MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index f796eeae..67beffa 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -110,6 +110,10 @@ private: bool 
fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); + bool fixGetRegWaitIdle(MachineInstr *MI); + bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI); + bool fixScratchBaseForwardingHazard(MachineInstr *MI); + bool fixSetRegMode(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? 
std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. 
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. 
occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. 
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. 
Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f47ddf5..2a8385d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -390,7 +390,11 @@ public: /// the original value. 
bool zeroesHigh16BitsOfDest(unsigned Opcode) const; - bool supportsWGP() const { return getGeneration() >= GFX10; } + bool supportsWGP() const { + if (GFX1250Insts) + return false; + return getGeneration() >= GFX10; + } bool hasIntClamp() const { return HasIntClamp; @@ -1341,6 +1345,10 @@ public: bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool setRegModeNeedsVNOPs() const { + return GFX1250Insts && getGeneration() == GFX12; + } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1573,6 +1581,12 @@ public: // extended VA to 57 bits. bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } + // \returns true if the target needs to create a prolog for backward + // compatibility when preloading kernel arguments. + bool needsKernArgPreloadProlog() const { + return hasKernargPreload() && !GFX1250Insts; + } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); } @@ -1722,6 +1736,10 @@ public: /// unit requirement. unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + bool supportsWave32() const { return getGeneration() >= GFX10; } + + bool supportsWave64() const { return !hasGFX1250Insts(); } + bool isWave32() const { return getWavefrontSize() == 32; } @@ -1785,11 +1803,11 @@ public: // \returns true if the subtarget has a hazard requiring an "s_nop 0" // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". - bool requiresNopBeforeDeallocVGPRs() const { - // Currently all targets that support the dealloc VGPRs message also require - // the nop. - return true; - } + bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; } + + // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on + // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. + bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; } bool isDynamicVGPREnabled() const { return DynamicVGPR; } unsigned getDynamicVGPRBlockSize() const { @@ -1801,6 +1819,18 @@ public: // to the same register. return false; } + + // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything + // and must be surrounded by S_WAIT_ALU(0xFFE3). + bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const { + return getGeneration() == GFX12; + } + + // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base + // read. + bool hasScratchBaseForwardingHazard() const { + return GFX1250Insts && getGeneration() == GFX12; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ee8683a..aafbdc2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { - O << " /* DP ALU dpp only supports row_newbcast */"; + if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && + AMDGPU::isDPALU_DPP(Desc, STI)) { + O << " /* DP ALU dpp only supports " + << (isGFX12(STI) ?
"row_share" : "row_newbcast") << " */"; return; } if (Imm <= DppCtrl::QUAD_PERM_LAST) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f358084..61f6732 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && // Matrix B format operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) && + // Matrix B scale operand reuses op_sel_hi. + !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) && // Matrix B reuse operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 68302f0..197de12 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/TargetParser.h" @@ -277,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) { + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) { #define PRINT_RES_INFO(ARG) \ OS << "\t.set "; \ ARG->print(OS, getContext().getAsmInfo()); \ @@ -291,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( PRINT_RES_INFO(NumVGPR); PRINT_RES_INFO(NumAGPR); PRINT_RES_INFO(NumExplicitSGPR); + PRINT_RES_INFO(NumNamedBarrier); PRINT_RES_INFO(PrivateSegmentSize); PRINT_RES_INFO(UsesVCC); PRINT_RES_INFO(UsesFlatScratch); @@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split"); - if (IVersion.Major >= 10) { + if (AMDGPU::supportsWGP(STI)) PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ".amdhsa_workgroup_processor_mode"); + if (IVersion.Major >= 10) { PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, @@ -885,7 +886,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, if (!SymbolELF->isBindingSet()) SymbolELF->setBinding(ELF::STB_GLOBAL); - if (SymbolELF->declareCommon(Size, Alignment, true)) { + if (SymbolELF->declareCommon(Size, Alignment)) { report_fatal_error("Symbol: " + Symbol->getName() + " 
redeclared as different type"); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 9c49020..22afcde 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -62,10 +62,10 @@ public: virtual void EmitMCResourceInfo( const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) {}; + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {}; virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, @@ -141,14 +141,12 @@ public: void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; - void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, - const MCSymbol *NumExplicitSGPR, - const MCSymbol *PrivateSegmentSize, - const MCSymbol *UsesVCC, - const MCSymbol *UsesFlatScratch, - const MCSymbol *HasDynamicallySizedStack, - const MCSymbol *HasRecursion, - const MCSymbol *HasIndirectCall) override; + void EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, + const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, + const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, + const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override; void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR) override; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 2d0102f..7c01903 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -197,7 +197,7 @@ enum ClassFlags : unsigned { namespace AMDGPU { enum OperandType : unsigned { - /// Operands with register or 32-bit immediate + /// Operands with register, 32-bit, or 64-bit immediate OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, OPERAND_REG_IMM_INT64, OPERAND_REG_IMM_INT16, @@ -407,7 +407,7 @@ enum CPol { SCAL = 1 << 11, // Scale offset bit - ALL = TH | SCOPE, + ALL = TH | SCOPE | NV, // Helper bits TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy @@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0]. ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, + ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, ID_SHADER_CYCLES_HI = 30, ID_DVGPR_ALLOC_LO = 31, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f018f77..dce4e6f 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, // List of clobbering instructions. 
SmallVector<MachineInstr*, 8> Clobbers; // List of instructions marked for deletion. - SmallSet<MachineInstr*, 8> MergedInstrs; + SmallPtrSet<MachineInstr *, 8> MergedInstrs; bool Changed = false; @@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { bool AllAGPRUses = true; SetVector<const MachineInstr *> worklist; - SmallSet<const MachineInstr *, 4> Visited; + SmallPtrSet<const MachineInstr *, 4> Visited; SetVector<MachineInstr *> PHIOperands; worklist.insert(&MI); Visited.insert(&MI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b327fb..561019b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; @@ -6612,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && ST->hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) + AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3))) SplitSize = 64; auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, @@ -10816,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: { // these two intrinsics have two operands: barrier pointer and member count SDValue Chain = Op->getOperand(0); @@ -10823,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue BarOp = Op->getOperand(2); SDValue CntOp = Op->getOperand(3); SDValue M0Val; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; // extract the BarrierID from bits 4-9 of BarOp SDValue BarID; BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -10846,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL, - Op->getVTList(), Ops); + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } + case Intrinsic::amdgcn_s_barrier_join: { + // these three intrinsics have one operand: barrier pointer + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + SDValue BarOp = Op->getOperand(2); + unsigned Opc; + + if (isa<ConstantSDNode>(BarOp)) { + uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + + // extract the BarrierID from bits 4-9 of the immediate + unsigned BarID = (BarVal >> 4) & 0x3F; + SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); + Ops.push_back(K); + Ops.push_back(Chain); + } else { + Opc = AMDGPU::S_BARRIER_JOIN_M0; + + // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] + SDValue M0Val; + M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, + DAG.getShiftAmountConstant(4, MVT::i32, DL)); + M0Val = + SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, + DAG.getTargetConstant(0x3F, DL, MVT::i32)), + 0); + Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); + } + + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } case Intrinsic::amdgcn_s_prefetch_data: { @@ -11495,9 +11540,22 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { return FastLowered; SDLoc SL(Op); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); + SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); + + if (VT == MVT::bf16) { + SDValue ExtDiv = + DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags()); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv, + DAG.getTargetConstant(0, SL, MVT::i32)); + } + + assert(VT == MVT::f16); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d @@ -11514,9 +11572,6 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { // We will use ISD::FMA on targets that don't support ISD::FMAD. unsigned FMADOpCode = isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; - - SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); - SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); @@ -15684,7 +15739,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); - if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts()) return SDValue(); SDValue LHS = N->getOperand(0); @@ -16849,6 +16904,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + // Check if we cannot determine the bit size of the given value type. 
This + // can happen, for example, in this situation where we have an empty struct + // (size 0): `call void asm "", "v"({} poison)`- + if (VT == MVT::Other) + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: @@ -16897,13 +16957,26 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, } break; } - // We actually support i128, i16 and f16 as inline parameters - // even if they are not reported as legal - if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || - VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) - return std::pair(0U, RC); + } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) { + const unsigned BitWidth = VT.getSizeInBits(); + switch (BitWidth) { + case 16: + RC = &AMDGPU::AV_32RegClass; + break; + default: + RC = TRI->getVectorSuperClassForBitWidth(BitWidth); + if (!RC) + return std::pair(0U, nullptr); + break; + } } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::pair(0U, RC); + auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); if (Kind != '\0') { if (Kind == 'v') { @@ -16916,7 +16989,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (RC) { if (NumRegs > 1) { - if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs()) + if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs()) return std::pair(0U, nullptr); uint32_t Width = NumRegs * 32; @@ -16988,6 +17061,9 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { case 'a': return C_RegisterClass; } + } else if (Constraint.size() == 2) { + if (Constraint == "VA") + return C_RegisterClass; } if (isImmConstraint(Constraint)) { return C_Other; @@ -17727,23 +17803,9 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { /// Return if a flat address space atomicrmw can access private memory. 
static bool flatInstrMayAccessPrivate(const Instruction *I) { - const MDNode *NoaliasAddrSpaceMD = - I->getMetadata(LLVMContext::MD_noalias_addrspace); - if (!NoaliasAddrSpaceMD) - return true; - - for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; - ++I) { - auto *Low = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 0)); - if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) { - auto *High = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 1)); - return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS); - } - } - - return true; + const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace); + return !MD || + !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS); } TargetLowering::AtomicExpansionKind diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4b48fc4..343e455 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, case AMDGPU::S_MEMREALTIME: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: + case AMDGPU::S_BARRIER_LEAVE: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 89d9b0d..50964a9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 { let Inst{4} = r128; let Inst{5} = d16; let Inst{6} = a16; + let Inst{7} = cpol{5}; // nv let Inst{21-14} = op; let Inst{25-22} = dmask; let Inst{39-32} = vdata; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 19e6bcf..cc4bee0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*> SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); - if (ST.hasMovB64() && + if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && AMDGPU::isLegalDPALU_DPPControl( - getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { + ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); } @@ -2905,7 +2905,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const { - assert(RS && "RegScavenger required for long branching"); assert(MBB.empty() && "new block should be inserted for expanding unconditional branch"); assert(MBB.pred_size() == 1); @@ -4241,6 +4240,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || MI.getOpcode() == AMDGPU::S_SETPRIO || + MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG || changesVGPRIndexingMode(MI); } @@ -4267,12 +4267,15 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { if (MI.memoperands_empty()) return true; - // TODO (?): Does this need to be taught how to read noalias.addrspace ? - // See if any memory operand specifies an address space that involves scratch. 
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace; + return !MD || !AMDGPU::hasValueInRangeLikeMetadata( + *MD, AMDGPUAS::PRIVATE_ADDRESS); + } + return AS == AMDGPUAS::PRIVATE_ADDRESS; }); } @@ -5433,7 +5436,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { + !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && + AMDGPU::isDPALU_DPP(Desc, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -9225,7 +9229,7 @@ bool SIInstrInfo::isHighLatencyDef(int Opc) const { (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); } -unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); if (!Addr || !Addr->isFI()) @@ -9238,7 +9242,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); } -unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); assert(Addr && Addr->isFI()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 6b9403f..12ffae7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -996,6 +996,11 @@ public: bool isBarrier(unsigned Opcode) const { return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_INIT_M0 || + Opcode == AMDGPU::S_BARRIER_INIT_IMM || + Opcode == AMDGPU::S_BARRIER_JOIN_IMM || + Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_LEAVE_IMM || Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER; } @@ -1051,7 +1056,7 @@ public: } } - bool isWaitcnt(unsigned Opcode) const { + static bool isWaitcnt(unsigned Opcode) { switch (getNonSoftWaitcntOpcode(Opcode)) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_VSCNT: @@ -1402,8 +1407,8 @@ public: return get(pseudoToMCOpcode(Opcode)); } - unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const; - unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c552f1a..c425d97 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, f64) : VCSrc_f64, !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2707,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; - field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; field bit 
HasMatrixScale = 0; field bit HasMatrixReuse = 0; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b49c5a9..e204d6b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -87,6 +87,8 @@ enum InstClassEnum { GLOBAL_STORE_SADDR, FLAT_LOAD, FLAT_STORE, + FLAT_LOAD_SADDR, + FLAT_STORE_SADDR, GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of GLOBAL_STORE // any CombineInfo, they are only ever returned by // getCommonInstClass. @@ -354,6 +356,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORD_SADDR: case AMDGPU::FLAT_LOAD_DWORD: case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: @@ -367,6 +371,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::FLAT_LOAD_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: @@ -380,6 +386,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::FLAT_LOAD_DWORDX3: case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -393,6 +401,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: case AMDGPU::FLAT_LOAD_DWORDX4: case AMDGPU::FLAT_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: @@ -575,6 +585,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return GLOBAL_STORE_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return FLAT_LOAD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return FLAT_STORE_SADDR; } } @@ -661,6 +681,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return AMDGPU::GLOBAL_STORE_DWORD_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return AMDGPU::FLAT_LOAD_DWORD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return AMDGPU::FLAT_STORE_DWORD_SADDR; } } @@ -776,6 +806,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case 
AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: Result.SAddr = true; [[fallthrough]]; case AMDGPU::GLOBAL_LOAD_DWORD: @@ -1875,6 +1913,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 4: return AMDGPU::FLAT_STORE_DWORDX4; } + case FLAT_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4_SADDR; + } + case FLAT_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4_SADDR; + } case MIMG: assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2508,12 +2568,14 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_LOAD: + case FLAT_LOAD_SADDR: case GLOBAL_LOAD: case GLOBAL_LOAD_SADDR: NewMI = mergeFlatLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_STORE: + case FLAT_STORE_SADDR: case GLOBAL_STORE: case GLOBAL_STORE_SADDR: NewMI = mergeFlatStorePair(CI, Paired, Where->I); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f8878f3..e97536d 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -57,6 +57,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -76,10 +77,11 @@ private: LiveIntervals *LIS = nullptr; LiveVariables *LV = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; MachineRegisterInfo *MRI = nullptr; SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; - SmallSet<MachineBasicBlock *, 4> KillBlocks; + SmallPtrSet<MachineBasicBlock *, 4> KillBlocks; SmallSet<Register, 8> RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; @@ -138,8 +140,8 @@ private: public: SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT) - : LIS(LIS), LV(LV), MDT(MDT) {} + MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} bool run(MachineFunction &MF); }; @@ -159,6 +161,7 @@ public: AU.addUsedIfAvailable<LiveIntervalsWrapperPass>(); // Should preserve the same set that TwoAddressInstructions does. 
AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); @@ -457,7 +460,7 @@ MachineBasicBlock::iterator SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - SmallSet<const MachineBasicBlock *, 4> Visited; + SmallPtrSet<const MachineBasicBlock *, 4> Visited; MachineBasicBlock *B = &MBB; do { if (!Visited.insert(B).second) @@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); + if (SplitBB != &MBB && (MDT || PDT)) { + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); } Opcode = OrTermrOpc; InsPt = MI; @@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { MachineBasicBlock *Succ = *MBB.succ_begin(); MachineBasicBlock *FallThrough = nullptr; + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 8> DTUpdates; + while (!MBB.predecessors().empty()) { MachineBasicBlock *P = *MBB.pred_begin(); if (P->getFallThrough(false) == &MBB) FallThrough = P; P->ReplaceUsesOfBlockWith(&MBB, Succ); + DTUpdates.push_back({DomTreeT::Insert, P, Succ}); + DTUpdates.push_back({DomTreeT::Delete, P, &MBB}); } MBB.removeSuccessor(Succ); if (LIS) { for (auto &I : MBB.instrs()) LIS->RemoveMachineInstrFromMaps(I); } - if (MDT) { - // If Succ, the single successor of MBB, is dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT).run(MF); + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + MachinePostDominatorTree *PDT = + PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; + return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses @@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF, LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF); MachineDominatorTree *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + MachinePostDominatorTree *PDT = + MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF); + bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9509199..09b737c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -209,10 +209,13 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { // So set the save points for those. // Use the points found by shrink-wrapping, if any. - if (MFI.getSavePoint()) { - SaveBlocks.push_back(MFI.getSavePoint()); - assert(MFI.getRestorePoint() && "Both restore and save must be set"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty()) { + assert(MFI.getSavePoints().size() == 1 && + "Multiple save points not yet supported!"); + SaveBlocks.push_back(MFI.getSavePoints().front()); + assert(MFI.getRestorePoints().size() == 1 && + "Multiple restore points not yet supported!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..8a11203 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,7 +33,7 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -cl::opt<bool> MFMAVGPRForm( +static cl::opt<bool> MFMAVGPRForm( "amdgpu-mfma-vgpr-form", cl::Hidden, cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. @@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 205a45a..38d9a4b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (VirtReg.isPhysical()) continue; + if (!VirtReg.isValid()) + continue; + if (!VRM->hasPhys(VirtReg)) continue; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 5940f45..93ba0a3 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) { NumSGPRsForWavesPerEU = ZeroExpr; NumVGPRsForWavesPerEU = ZeroExpr; + NamedBarCnt = ZeroExpr; Occupancy = ZeroExpr; DynamicCallStack = ZeroExpr; VCCUsed = ZeroExpr; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 79099d2..171c4a3 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. const MCExpr *NumVGPRsForWavesPerEU = nullptr; + // Number of named barriers used by the kernel. + const MCExpr *NamedBarCnt = nullptr; + // Final occupancy. const MCExpr *Occupancy = nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 81655f5..0293d40 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate +// SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate +// if supported by target. //===----------------------------------------------------------------------===// class SrcRegOrImm9<RegisterClass regClass, string operandType> diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 431d73b..a003a46 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o let isConvergent = 1; } +def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out let isConvergent = 1; } +def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } +def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let fixed_imm = 1; + let isConvergent = 1; + let Defs = [SCC]; +} + +def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; + def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; @@ -1630,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", 
[(int_amdgcn_s_sethalt timm:$simm16)]>; -def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; +def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { + let SubtargetPredicate = isNotGFX1250Plus; +} // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. On VI the @@ -2144,9 +2181,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; +defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; @@ -2639,6 +2680,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { } defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; +defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 3d9455f..c740b5e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, - {{""}}, + {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, @@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{""}}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1e3e9a2..6e4e087 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -1160,17 +1161,28 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { return 65536; if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) return 163840; + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 327680; return 0; } unsigned getEUsPerCU(const MCSubtargetInfo *STI) { // "Per CU" really means "per whatever functional block the waves of a - // 
workgroup must share". For gfx10 in CU mode this is the CU, which contains + // workgroup must share". + + // GFX12.5 only supports CU mode, which contains four SIMDs. + if (isGFX1250(*STI)) { + assert(STI->getFeatureBits().test(FeatureCuMode)); + return 4; + } + + // For gfx10 in CU mode the functional block is the CU, which contains // two SIMDs. if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode)) return 2; - // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains - // two CUs, so a total of four SIMDs. + + // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP + // contains two CUs, so a total of four SIMDs. return 4; } @@ -1666,6 +1678,29 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size) { return Vals; } +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) { + assert((MD.getNumOperands() % 2 == 0) && "invalid number of operands!"); + for (unsigned I = 0, E = MD.getNumOperands() / 2; I != E; ++I) { + auto Low = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 0))->getValue(); + auto High = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 1))->getValue(); + // There are two types of [A; B) ranges: + // A < B, e.g. [4; 5) which is a range that only includes 4. + // A > B, e.g. [5; 4) which is a range that wraps around and includes + // everything except 4. + if (Low.ult(High)) { + if (Low.ule(Val) && High.ugt(Val)) + return true; + } else { + if (Low.uge(Val) && High.ult(Val)) + return true; + } + } + + return false; +} + unsigned getVmcntBitMask(const IsaVersion &Version) { return (1 << (getVmcntBitWidthLo(Version.Major) + getVmcntBitWidthHi(Version.Major))) - @@ -2406,7 +2441,11 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { return 0; } -unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return 32; + return 16; +} bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); @@ -2478,6 +2517,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool supportsWGP(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return false; + return isGFX10Plus(STI); +} + bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); } bool isNotGFX10Plus(const MCSubtargetInfo &STI) { @@ -3309,13 +3354,39 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { return false; } -bool isDPALU_DPP(const MCInstrDesc &OpDesc) { +bool isDPALU_DPP32BitOpc(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MUL_LO_U32_e64: + case AMDGPU::V_MUL_LO_U32_e64_dpp: + case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_U32_e64: + case AMDGPU::V_MUL_HI_U32_e64_dpp: + case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_I32_e64: + case AMDGPU::V_MUL_HI_I32_e64_dpp: + case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250: + case AMDGPU::V_MAD_U32_e64: + case AMDGPU::V_MAD_U32_e64_dpp: + case AMDGPU::V_MAD_U32_e64_dpp_gfx1250: + return true; + default: + return false; + } +} + +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { + if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) + return false; + + if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) + return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); + return hasAny64BitVGPROperands(OpDesc); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - // Currently this is 128 for all subtargets - return 128; + 
return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256 + : 128; } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1bcd36c..70dfb63 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -35,6 +35,7 @@ class MCInstrInfo; class MCRegisterClass; class MCRegisterInfo; class MCSubtargetInfo; +class MDNode; class StringRef; class Triple; class raw_ostream; @@ -1064,6 +1065,9 @@ SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name, std::optional<SmallVector<unsigned>> getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size); +/// Checks if \p Val is inside \p MD, a !range-like metadata. +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val); + /// Represents the counter values to wait for in an s_waitcnt instruction. /// /// Large values (including the maximum possible integer) can be used to @@ -1549,6 +1553,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); @@ -1750,15 +1755,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); LLVM_READNONE -inline bool isLegalDPALU_DPPControl(unsigned DC) { - return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { + if (isGFX12(ST)) + return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST; + if (isGFX90A(ST)) + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; + return false; } /// \returns true if an instruction may have a 64-bit VGPR operand. bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. +bool isDPALU_DPP32BitOpc(unsigned Opc); + /// \returns true if an instruction is a DP ALU DPP. -bool isDPALU_DPP(const MCInstrDesc &OpDesc); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b128207..11c7275 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -706,7 +706,6 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>; let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12PlusNot12_50 in defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>; let SubtargetPredicate = isGFX125xOnly in @@ -731,7 +730,6 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe >; let OtherPredicates = [HasFP8ConversionInsts] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. 
let SubtargetPredicate = isGFX12PlusNot12_50 in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>; let SubtargetPredicate = isGFX125xOnly in { @@ -740,7 +738,6 @@ let OtherPredicates = [HasFP8ConversionInsts] in { def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel), (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>; } - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12Plus in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>; } @@ -1058,11 +1055,6 @@ multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op, - string opName, string asmName> : - VOP1_Real_e32_with_name<Gen, op, opName, asmName>, - VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>; - multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f4b6af6..329d003 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> : multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> : VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>; +multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> : + VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>; + multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, string asmName> : VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>, @@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>; defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">; defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">; @@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>; + defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; diff --git a/llvm/lib/Target/ARM/ARMCallingConv.h b/llvm/lib/Target/ARM/ARMCallingConv.h index 7c692f0..b6b2d59 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/llvm/lib/Target/ARM/ARMCallingConv.h @@ -19,34 +19,35 @@ namespace llvm { bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + 
Type *OrigTy, CCState &State); bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); } // namespace llvm diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index ef69083..c53e215 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -109,7 +109,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet<MachineBasicBlock*, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 7ba2487..14e1160 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -1943,8 +1943,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, unsigned &NumBytes, bool isVarArg) { SmallVector<CCValAssign, 16> ArgLocs; + SmallVector<Type *, 16> OrigTys; + for (Value *Arg : Args) + OrigTys.push_back(Arg->getType()); CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, OrigTys, CCAssignFnForCall(CC, false, isVarArg)); // Check that we can handle all of the arguments. If we can't, then bail out @@ -2093,7 +2096,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + CCInfo.AnalyzeCallResult(RetVT, I->getType(), + CCAssignFnForCall(CC, true, isVarArg)); // Copy all of the result registers out of their specified physreg. 
if (RVLocs.size() == 2 && RetVT == MVT::f64) { @@ -2278,7 +2282,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (RetVT != MVT::isVoid && RetVT != MVT::i32) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false)); + CCInfo.AnalyzeCallResult(RetVT, RetTy, CCAssignFnForCall(CC, true, false)); if (RVLocs.size() >= 2 && RetVT != MVT::f64) return false; } @@ -2389,7 +2393,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, RetVT != MVT::i16 && RetVT != MVT::i32) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + CCInfo.AnalyzeCallResult(RetVT, RetTy, + CCAssignFnForCall(CC, true, isVarArg)); if (RVLocs.size() >= 2 && RetVT != MVT::f64) return false; } @@ -2499,6 +2504,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + diagnoseDontCall(*CI); return true; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ea99cc4..8301563 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } } - // RTLIB - if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isAndroid())) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - // clang-format off - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - // Double-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 2 - { RTLIB::ADD_F64, RTLIB::__aeabi_dadd }, - { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv }, - { RTLIB::MUL_F64, RTLIB::__aeabi_dmul }, - { RTLIB::SUB_F64, RTLIB::__aeabi_dsub }, - - // Double-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 3 - { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq }, - { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une }, - { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt }, - { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple }, - { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge }, - { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt }, - { RTLIB::UO_F64, RTLIB::__aeabi_dcmpun }, - - // Single-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 4 - { RTLIB::ADD_F32, RTLIB::__aeabi_fadd }, - { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv }, - { RTLIB::MUL_F32, RTLIB::__aeabi_fmul }, - { RTLIB::SUB_F32, RTLIB::__aeabi_fsub }, - - // Single-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 5 - { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq }, - { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une }, - { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt}, - { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple }, - { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge }, - { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt }, - { RTLIB::UO_F32, RTLIB::__aeabi_fcmpun }, - - // Floating-point to integer conversions. 
- // RTABI chapter 4.1.2, Table 6 - { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz }, - { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz }, - { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz }, - { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz }, - { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz }, - { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz }, - { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz }, - { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz }, - - // Conversions between floating types. - // RTABI chapter 4.1.2, Table 7 - { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f }, - { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h }, - { RTLIB::FPEXT_F32_F64, RTLIB::__aeabi_f2d }, - - // Integer to floating-point conversions. - // RTABI chapter 4.1.2, Table 8 - { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d }, - { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d }, - { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d }, - { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d }, - { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f }, - { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f }, - { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f }, - { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f }, - - // Long long helper functions - // RTABI chapter 4.2, Table 9 - { RTLIB::MUL_I64, RTLIB::__aeabi_lmul }, - { RTLIB::SHL_I64, RTLIB::__aeabi_llsl }, - { RTLIB::SRL_I64, RTLIB::__aeabi_llsr }, - { RTLIB::SRA_I64, RTLIB::__aeabi_lasr }, - - // Integer division functions - // RTABI chapter 4.3.1 - { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv }, - { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod }, - { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv }, - { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod }, - }; - // clang-format on - - for (const auto &LC : LibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - - // EABI dependent RTLIB - if (TM.Options.EABIVersion == EABI::EABI4 || - TM.Options.EABIVersion == EABI::EABI5) { - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } MemOpsLibraryCalls[] = { - // Memory operations - // RTABI chapter 4.3.4 - {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy}, - {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove}, - {RTLIB::MEMSET, RTLIB::__aeabi_memset}, - {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4}, - {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8}, - {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4}, - {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8}, - {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4}, - {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8}, - {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr}, - {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4}, - {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8}, - }; - - for (const auto &LC : MemOpsLibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - } - } - - // The half <-> float conversion functions are always soft-float on - // non-watchos platforms, but are needed for some targets which use a - // hard-float calling convention by default. 
- if (!TT.isWatchABI()) { - if (TM.isAAPCS_ABI()) { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS); - } else { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS); - } - } - - // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have - // a __gnu_ prefix (which is the default). - if (TT.isTargetAEABI()) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h}, - {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f}, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallImpl(LC.Op, LC.Impl); - } - } else if (!TT.isOSBinFormatMachO()) { - setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee); - setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee); - } - if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -802,6 +641,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::BSWAP, VT, Expand); } + if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::SCMP, MVT::i32, Custom); + + if (!Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); @@ -1634,6 +1479,10 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } +bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { + return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, @@ -3769,10 +3618,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // call __tls_get_addr. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Argument; - Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); - Args.push_back(Entry); + Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext())); // FIXME: is there useful debug info available here? 
TargetLowering::CallLoweringInfo CLI(DAG); @@ -7396,7 +7242,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; // If the mask is twice as long as the input vector then we need to check the @@ -7428,7 +7274,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7531,7 +7377,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7564,7 +7410,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -9991,9 +9837,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - ArgListEntry Entry; - Entry.Node = SRet; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); Entry.IsSExt = false; Entry.IsZExt = false; Entry.IsSRet = true; @@ -10001,12 +9845,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { RetTy = Type::getVoidTy(*DAG.getContext()); } - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); RTLIB::Libcall LC = (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; @@ -10059,10 +9898,9 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, ARMTargetLowering::ArgListTy Args; for (auto AI : {1, 0}) { - ArgListEntry Arg; - Arg.Node = Op.getOperand(AI); - Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Arg); + SDValue Operand = Op.getOperand(AI); + Args.emplace_back(Operand, + Operand.getValueType().getTypeForEVT(*DAG.getContext())); } CallLoweringInfo CLI(DAG); @@ -10612,6 +10450,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op, return DAG.getBitcast(MVT::i32, Res); } +SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + // Determine if this is signed or unsigned comparison + bool IsSigned = (Op.getOpcode() == ISD::SCMP); + + // Special case for Thumb1 UCMP only + if (!IsSigned && Subtarget->isThumb1Only()) { + // For Thumb unsigned comparison, use this sequence: + // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags + // sbc r2, r2 ; r2 = r2 - r2 - !carry + // cmp r1, r0 ; compare RHS with LHS + // sbc r1, r1 ; r1 = r1 - r1 - !carry + // subs r0, r2, r1 ; r0 = r2 - r1 (final result) + + // First subtraction: LHS - RHS + SDValue Sub1WithFlags = DAG.getNode( + ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + SDValue Sub1Result = Sub1WithFlags.getValue(0); + SDValue Flags1 = Sub1WithFlags.getValue(1); + + // SUBE: Sub1Result - Sub1Result - !carry + // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned) + SDValue Sbc1 = + DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), + Sub1Result, Sub1Result, Flags1); + SDValue Sbc1Result = Sbc1.getValue(0); + + // Second comparison: RHS vs LHS (reverse comparison) + SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS); + + // SUBE: RHS - RHS - !carry + // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned) + SDValue Sbc2 = DAG.getNode( + ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags); + SDValue Sbc2Result = Sbc2.getValue(0); + + // Final subtraction: Sbc1Result - Sbc2Result (no flags needed) + SDValue Result = + DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result); + if (Op.getValueType() != MVT::i32) + Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType()); + + return Result; + } + + // For the ARM assembly pattern: + // subs r0, r0, r1 ; subtract RHS from LHS and set flags + // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for + // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for + // signed, LO for unsigned) + // ; if LHS == RHS, result remains 0 from the subs + + // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC + unsigned Opcode = ARMISD::SUBC; + + // Check if RHS is a subtraction against 0: (0 - X) + if (RHS.getOpcode() == ISD::SUB) { + SDValue SubLHS = RHS.getOperand(0); + SDValue SubRHS = RHS.getOperand(1); + + // Check if it's 0 - X + if (isNullConstant(SubLHS)) { + bool CanUseAdd = false; + if (IsSigned) { + // For SCMP: only if X is known to never be INT_MIN (to avoid overflow) + if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS) + .getSignedMinValue() + .isMinSignedValue()) { + CanUseAdd = true; + } + } else { + // For UCMP: only if X is known to never be zero + if (DAG.isKnownNeverZero(SubRHS)) { + CanUseAdd = true; + } + } + + if (CanUseAdd) { + Opcode = ARMISD::ADDC; + RHS = SubRHS; // 
Replace RHS with X, so we do LHS + X instead of + // LHS - (0 - X) + } + } + } + + // Generate the operation with flags + SDValue OpWithFlags; + if (Opcode == ARMISD::ADDC) { + // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) + OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } else { + // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) + OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } + + SDValue OpResult = OpWithFlags.getValue(0); // The operation result + SDValue Flags = OpWithFlags.getValue(1); // The flags + + // Constants for conditional moves + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32); + + // Select condition codes based on signed vs unsigned + ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI; + ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO; + + // First conditional move: if greater than, set to 1 + SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32); + SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One, + GTCondValue, Flags); + + // Second conditional move: if less than, set to -1 + SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32); + SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, + LTCondValue, Flags); + + if (Op.getValueType() != MVT::i32) + Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType()); + + return Result2; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10740,6 +10705,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::UCMP: + case ISD::SCMP: + return LowerCMP(Op, DAG); } } @@ -20627,12 +20595,10 @@ static TargetLowering::ArgListTy getDivRemArgList( bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { EVT ArgVT = N->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*Context); - Entry.Node = N->getOperand(i); - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); @@ -21605,7 +21571,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -21615,7 +21581,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 
825145d..778595e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -607,6 +607,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { @@ -683,8 +685,8 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; @@ -904,6 +906,7 @@ class VectorType; void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index b4677a8..ebfa593 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -89,19 +89,15 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( AlignVariant = ALIGN1; TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; - Args.push_back(Entry); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, IntPtrTy); if (AEABILibcall == AEABI_MEMCLR) { - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Size, IntPtrTy); } else if (AEABILibcall == AEABI_MEMSET) { // Adjust parameters for memset, EABI uses format (ptr, size, value), // GNU library uses (ptr, value, size) // See RTABI section 4.3.4 - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Size, IntPtrTy); // Extend or truncate the argument to be an i32 value for the call. 
if (Src.getValueType().bitsGT(MVT::i32)) @@ -109,16 +105,13 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( else if (Src.getValueType().bitsLT(MVT::i32)) Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); - Entry.Node = Src; - Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + TargetLowering::ArgListEntry Entry(Src, + Type::getInt32Ty(*DAG.getContext())); Entry.IsSExt = false; Args.push_back(Entry); } else { - Entry.Node = Src; - Args.push_back(Entry); - - Entry.Node = Size; - Args.push_back(Entry); + Args.emplace_back(Src, IntPtrTy); + Args.emplace_back(Size, IntPtrTy); } static const RTLIB::Libcall FunctionImpls[4][3] = { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 6f37eca..6b28541 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost( CostKind, Op1Info, Op2Info, I); } -InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -1095,7 +1096,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, int MaxMergeDistance = 64; if (ST->hasNEON()) { - if (Ty->isVectorTy() && SE && + if (PtrTy->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; @@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, // addressing mode. return 1; } - return BaseT::getAddressComputationCost(Ty, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const { @@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (!Mask.empty()) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy); + // Check for LD2/LD4 instructions, which are represented in llvm IR as + // deinterleaving-shuffle(load). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that LD2/LD4 have a + // higher cost than just the load. + if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))) + return ST->getMVEVectorCostFactor(CostKind) * + std::max<InstructionCost>(1, LT.first / 4); + + // Check for ST2/ST4 instructions, which are represented in llvm IR as + // store(interleaving-shuffle). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that ST2/ST4 have a + // higher cost than just the store. 
+ if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isInterleaveMask( + Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isInterleaveMask( + Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2)))) + return ST->getMVEVectorCostFactor(CostKind) * LT.first; + if (LT.second.isVector() && Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 522c235..cdd8bcb 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,8 +257,9 @@ public: unsigned Index, const Value *Op0, const Value *Op1) const override; - InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ece6c10..0e97483 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -3373,12 +3373,12 @@ public: void addMSRMaskOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask()))); + Inst.addOperand(MCOperand::createImm(getMSRMask())); } void addBankedRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg()))); + Inst.addOperand(MCOperand::createImm(getBankedReg())); } void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 8ee3a2d..a5266a9 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 0b4e7df..5eeb4fe 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -922,7 +922,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, // the function. unsigned LastVPTImm = 0; Register LastVPTReg = 0; - SmallSet<MachineInstr *, 4> DeadInstructions; + SmallPtrSet<MachineInstr *, 4> DeadInstructions; for (MachineInstr &Instr : MBB.instrs()) { // Look for predicated MVE instructions. 
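A minimal reference sketch (not part of any patch in this diff) of the Thumb1 ISD::UCMP lowering added to ARMISelLowering.cpp above: each SBC turns the borrow of the preceding compare into 0 or -1, and the final subtraction combines the two borrows into the -1/0/+1 result. The helper name below is hypothetical.

#include <cstdint>

// Reference model of the subs/sbc/cmp/sbc/subs sequence, assuming ordinary
// unsigned 32-bit comparisons.
static int32_t thumb1_ucmp_ref(uint32_t lhs, uint32_t rhs) {
  int32_t lt = lhs < rhs ? -1 : 0; // sbc after "subs r2, r0, r1"
  int32_t gt = rhs < lhs ? -1 : 0; // sbc after "cmp r1, r0"
  return lt - gt;                  // -1 if lhs < rhs, 0 if equal, +1 if lhs > rhs
}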
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 25ad9ec..545bc3a 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -505,10 +505,9 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (SDValue const &Value : Op->op_values()) { - Entry.Node = Value; - Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + Value, Value.getValueType().getTypeForEVT(*DAG.getContext())); Entry.IsSExt = IsSigned; Entry.IsZExt = !IsSigned; Args.push_back(Entry); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h index 2ae22b2..301ce9c 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.h +++ b/llvm/lib/Target/AVR/AVRISelLowering.h @@ -94,6 +94,8 @@ public: return ShiftLegalizationStrategy::LowerToLibcall; } + bool softPromoteHalfType() const override { return true; } + private: SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc, SelectionDAG &DAG, SDLoc dl) const; diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index b75417a..fbd1484 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -20,6 +20,7 @@ #include "AVR.h" #include "AVRMachineFunctionInfo.h" #include "AVRTargetObjectFile.h" +#include "AVRTargetTransformInfo.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "TargetInfo/AVRTargetInfo.h" @@ -28,7 +29,7 @@ namespace llvm { static const char *AVRDataLayout = - "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"; + "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8:16-a:8"; /// Processes a CPU name. 
static StringRef getCPU(StringRef CPU) { @@ -62,7 +63,9 @@ namespace { class AVRPassConfig : public TargetPassConfig { public: AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + EnableLoopTermFold = true; + } AVRTargetMachine &getAVRTargetMachine() const { return getTM<AVRTargetMachine>(); @@ -107,6 +110,11 @@ const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const { return &SubTarget; } +TargetTransformInfo +AVRTargetMachine::getTargetTransformInfo(const Function &F) const { + return TargetTransformInfo(std::make_unique<AVRTTIImpl>(this, F)); +} + MachineFunctionInfo *AVRTargetMachine::createMachineFunctionInfo( BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const { diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.h b/llvm/lib/Target/AVR/AVRTargetMachine.h index 167d007..9452b3d 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.h +++ b/llvm/lib/Target/AVR/AVRTargetMachine.h @@ -48,6 +48,8 @@ public: createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; + bool isNoopAddrSpaceCast(unsigned SrcAs, unsigned DestAs) const override { // While AVR has different address spaces, they are all represented by // 16-bit pointers that can be freely casted between (of course, a pointer diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp new file mode 100644 index 0000000..b1ef380 --- /dev/null +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp @@ -0,0 +1,24 @@ +//===-- AVRTargetTransformInfo.cpp - AVR specific TTI ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetTransformInfo.h" + +using namespace llvm; + +bool AVRTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) const { + // The AVR-specific rule here is that the instruction count has first priority. + // If we need to emit adds inside the loop to add up base registers, then + // we need at least one extra temporary register. + unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0); + unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0); + return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls, + C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < + std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls, + C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); +} diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h new file mode 100644 index 0000000..0daeeb8 --- /dev/null +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h @@ -0,0 +1,51 @@ +//===- AVRTargetTransformInfo.h - AVR specific TTI --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a TargetTransformInfoImplBase conforming object specific +/// to the AVR target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H + +#include "AVRSubtarget.h" +#include "AVRTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Function.h" +#include <optional> + +namespace llvm { + +class AVRTTIImpl final : public BasicTTIImplBase<AVRTTIImpl> { + using BaseT = BasicTTIImplBase<AVRTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const AVRSubtarget *ST; + const AVRTargetLowering *TLI; + + const AVRSubtarget *getST() const { return ST; } + const AVRTargetLowering *getTLI() const { return TLI; } + +public: + explicit AVRTTIImpl(const AVRTargetMachine *TM, const Function &F) + : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) const override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AVR_AVRTARGETTRANSFORMINFO_H diff --git a/llvm/lib/Target/AVR/CMakeLists.txt b/llvm/lib/Target/AVR/CMakeLists.txt index 781dac0..a31c545 100644 --- a/llvm/lib/Target/AVR/CMakeLists.txt +++ b/llvm/lib/Target/AVR/CMakeLists.txt @@ -29,11 +29,13 @@ add_llvm_target(AVRCodeGen AVRSubtarget.cpp AVRTargetMachine.cpp AVRTargetObjectFile.cpp + AVRTargetTransformInfo.cpp DEPENDS intrinsics_gen LINK_COMPONENTS + Analysis AVRDesc AVRInfo AsmPrinter @@ -44,6 +46,8 @@ add_llvm_target(AVRCodeGen SelectionDAG Support Target + TargetParser + TransformUtils ADD_TO_COMPONENT AVR diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index e55d9b2..7885d93 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -116,7 +116,7 @@ class CSKYConstantIslands : public MachineFunctionPass { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet<MachineBasicBlock *, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index 7070171..e5b4f6e 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -1329,10 +1329,7 @@ SDValue CSKYTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
TargetLowering::CallLoweringInfo CLI(DAG); diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index c7c09ca..8100f94 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_target(DirectXCodeGen DirectXInfo DirectXPointerTypeAnalysis FrontendHLSL + IPO MC ScalarOpts SelectionDAG diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 26a113d..a1ef257 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -162,8 +162,7 @@ void DXContainerGlobals::addRootSignature(Module &M, auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - const std::optional<mcdxbc::RootSignatureDesc> &RS = - RSA.getDescForFunction(EntryFunction); + const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction); if (!RS) return; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 492e078..c65ead4 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1108,11 +1108,11 @@ def RawBufferStore : DXILOp<140, rawBufferStore> { def Dot2AddHalf : DXILOp<162, dot2AddHalf> { let Doc = "2D half dot product with accumulate to float"; let intrinsics = [IntrinSelect<int_dx_dot2add>]; - let arguments = [FloatTy, HalfTy, HalfTy, HalfTy, HalfTy]; - let result = FloatTy; - let overloads = [Overloads<DXIL1_0, []>]; - let stages = [Stages<DXIL1_0, [all_stages]>]; - let attributes = [Attributes<DXIL1_0, [ReadNone]>]; + let arguments = [OverloadTy, HalfTy, HalfTy, HalfTy, HalfTy]; + let result = OverloadTy; + let overloads = [Overloads<DXIL1_4, [FloatTy]>]; + let stages = [Stages<DXIL1_4, [all_stages]>]; + let attributes = [Attributes<DXIL1_4, [ReadNone]>]; } def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index 5f331db..13e3408 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -20,13 +20,13 @@ using namespace llvm; static bool finalizeLinkage(Module &M) { bool MadeChange = false; - // Convert private global variables to internal linkage. - for (GlobalVariable &GV : M.globals()) { - if (GV.hasPrivateLinkage()) { + // Convert private globals and external globals with no usage to internal + // linkage. 
+ for (GlobalVariable &GV : M.globals()) + if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) { GV.setLinkage(GlobalValue::InternalLinkage); MadeChange = true; } - } SmallVector<Function *> Funcs; diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp index 306db6a..695eacb 100644 --- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp +++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp @@ -9,10 +9,13 @@ #include "DXILForwardHandleAccesses.h" #include "DXILShaderFlags.h" #include "DirectX.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/DXILResource.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsDirectX.h" @@ -70,6 +73,7 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { DenseMap<GlobalVariable *, IntrinsicInst *> HandleMap; SmallVector<LoadInst *> LoadsToProcess; + DenseMap<AllocaInst *, SmallVector<IntrinsicInst *>> LifeTimeIntrinsicMap; for (BasicBlock &BB : F) for (Instruction &Inst : BB) if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { @@ -78,6 +82,14 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { case Intrinsic::dx_resource_handlefromimplicitbinding: processHandle(II, HandleMap); break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + if (II->arg_size() >= 1) { + Value *Ptr = II->getArgOperand(0); + if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) + LifeTimeIntrinsicMap[Alloca].push_back(II); + } + break; default: continue; } @@ -98,8 +110,16 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr); GV = dyn_cast_or_null<GlobalVariable>(Loaded); } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) { - for (auto &Use : NestedAlloca->uses()) { - auto *Store = dyn_cast<StoreInst>(Use.getUser()); + + if (auto It = LifeTimeIntrinsicMap.find(NestedAlloca); + It != LifeTimeIntrinsicMap.end()) { + llvm::for_each(It->second, + [](IntrinsicInst *II) { II->eraseFromParent(); }); + LifeTimeIntrinsicMap.erase(It); + } + + for (auto *User : NestedAlloca->users()) { + auto *Store = dyn_cast<StoreInst>(User); if (!Store) continue; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 0ec15a6..bd421771 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -9,6 +9,7 @@ #include "DXILOpLowering.h" #include "DXILConstants.h" #include "DXILOpBuilder.h" +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" @@ -746,7 +747,7 @@ public: IRBuilder<> &IRB = OpBuilder.getIRB(); return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); - Value *Ptr = CI->getArgOperand(1); + Value *Ptr = CI->getArgOperand(0); assert(Ptr->getType()->isPointerTy() && "Expected operand of lifetime intrinsic to be a pointer"); @@ -918,6 +919,7 @@ PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) { PA.preserve<DXILResourceAnalysis>(); PA.preserve<DXILMetadataAnalysis>(); PA.preserve<ShaderFlagsAnalysis>(); + PA.preserve<RootSignatureAnalysis>(); return PA; } @@ -945,6 +947,7 @@ public: AU.addPreserved<DXILResourceWrapperPass>(); 
AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); AU.addPreserved<ShaderFlagsAnalysisWrapper>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); } }; char DXILOpLoweringLegacy::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp index 398dcbb..be2c7d1 100644 --- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp +++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "DXILPostOptimizationValidation.h" +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/SmallString.h" @@ -17,13 +18,44 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/DXILABI.h" #define DEBUG_TYPE "dxil-post-optimization-validation" using namespace llvm; using namespace llvm::dxil; -namespace { +static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) { + using namespace dxbc; + switch (RangeType) { + case DescriptorRangeType::SRV: + return ResourceClass::SRV; + case DescriptorRangeType::UAV: + return ResourceClass::UAV; + case DescriptorRangeType::CBV: + return ResourceClass::CBuffer; + case DescriptorRangeType::Sampler: + return ResourceClass::Sampler; + } + llvm_unreachable("Unknown DescriptorRangeType"); +} + +static ResourceClass toResourceClass(dxbc::RootParameterType Type) { + using namespace dxbc; + switch (Type) { + case RootParameterType::Constants32Bit: + return ResourceClass::CBuffer; + case RootParameterType::SRV: + return ResourceClass::SRV; + case RootParameterType::UAV: + return ResourceClass::UAV; + case RootParameterType::CBV: + return ResourceClass::CBuffer; + case dxbc::RootParameterType::DescriptorTable: + llvm_unreachable("DescriptorTable is not convertible to ResourceClass"); + } + llvm_unreachable("Unknown RootParameterType"); +} static void reportInvalidDirection(Module &M, DXILResourceMap &DRM) { for (const auto &UAV : DRM.uavs()) { @@ -63,9 +95,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1, } static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { - if (DRM.empty()) - return; - + bool ErrorFound = false; for (const auto &ResList : {DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) { if (ResList.empty()) @@ -77,15 +107,136 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { while (RI != ResList.end() && PrevRI->getBinding().overlapsWith(RI->getBinding())) { reportOverlappingError(M, *PrevRI, *RI); + ErrorFound = true; RI++; } PrevRI = CurrentRI; } } + assert(ErrorFound && "this function should be called only when if " + "DXILResourceBindingInfo::hasOverlapingBinding() is " + "true, yet no overlapping binding was found"); +} + +static void +reportOverlappingRegisters(Module &M, + const llvm::hlsl::BindingInfoBuilder::Binding &R1, + const llvm::hlsl::BindingInfoBuilder::Binding &R2) { + SmallString<128> Message; + + raw_svector_ostream OS(Message); + OS << "resource " << getResourceClassName(R1.RC) << " (space=" << R1.Space + << ", registers=[" << R1.LowerBound << ", " << R1.UpperBound + << "]) overlaps with resource " << getResourceClassName(R2.RC) + << " (space=" << R2.Space << ", registers=[" << R2.LowerBound << ", " + << R2.UpperBound << "])"; + M.getContext().diagnose(DiagnosticInfoGeneric(Message)); +} + +static dxbc::ShaderVisibility 
+tripleToVisibility(llvm::Triple::EnvironmentType ET) { + switch (ET) { + case Triple::Pixel: + return dxbc::ShaderVisibility::Pixel; + case Triple::Vertex: + return dxbc::ShaderVisibility::Vertex; + case Triple::Geometry: + return dxbc::ShaderVisibility::Geometry; + case Triple::Hull: + return dxbc::ShaderVisibility::Hull; + case Triple::Domain: + return dxbc::ShaderVisibility::Domain; + case Triple::Mesh: + return dxbc::ShaderVisibility::Mesh; + case Triple::Compute: + return dxbc::ShaderVisibility::All; + default: + llvm_unreachable("Invalid triple to shader stage conversion"); + } +} + +static void validateRootSignature(Module &M, + const mcdxbc::RootSignatureDesc &RSD, + dxil::ModuleMetadataInfo &MMI) { + + hlsl::BindingInfoBuilder Builder; + dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile); + + for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) { + dxbc::ShaderVisibility ParamVisibility = + static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility); + if (ParamVisibility != dxbc::ShaderVisibility::All && + ParamVisibility != Visibility) + continue; + dxbc::RootParameterType ParamType = + static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType); + switch (ParamType) { + case dxbc::RootParameterType::Constants32Bit: { + dxbc::RTS0::v1::RootConstants Const = + RSD.ParametersContainer.getConstant(ParamInfo.Location); + Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace, + Const.ShaderRegister, Const.ShaderRegister, + &ParamInfo); + break; + } + + case dxbc::RootParameterType::SRV: + case dxbc::RootParameterType::UAV: + case dxbc::RootParameterType::CBV: { + dxbc::RTS0::v2::RootDescriptor Desc = + RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location); + Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>( + ParamInfo.Header.ParameterType)), + Desc.RegisterSpace, Desc.ShaderRegister, + Desc.ShaderRegister, &ParamInfo); + + break; + } + case dxbc::RootParameterType::DescriptorTable: { + const mcdxbc::DescriptorTable &Table = + RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location); + + for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) { + uint32_t UpperBound = + Range.NumDescriptors == ~0U + ? 
Range.BaseShaderRegister + : Range.BaseShaderRegister + Range.NumDescriptors - 1; + Builder.trackBinding( + toResourceClass( + static_cast<dxbc::DescriptorRangeType>(Range.RangeType)), + Range.RegisterSpace, Range.BaseShaderRegister, UpperBound, + &ParamInfo); + } + break; + } + } + } + + for (const dxbc::RTS0::v1::StaticSampler &S : RSD.StaticSamplers) + Builder.trackBinding(dxil::ResourceClass::Sampler, S.RegisterSpace, + S.ShaderRegister, S.ShaderRegister, &S); + + Builder.calculateBindingInfo( + [&M](const llvm::hlsl::BindingInfoBuilder &Builder, + const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) { + const llvm::hlsl::BindingInfoBuilder::Binding &Overlaping = + Builder.findOverlapping(ReportedBinding); + reportOverlappingRegisters(M, ReportedBinding, Overlaping); + }); +} + +static mcdxbc::RootSignatureDesc * +getRootSignature(RootSignatureBindingInfo &RSBI, + dxil::ModuleMetadataInfo &MMI) { + if (MMI.EntryPropertyVec.size() == 0) + return nullptr; + return RSBI.getDescForFunction(MMI.EntryPropertyVec[0].Entry); } static void reportErrors(Module &M, DXILResourceMap &DRM, - DXILResourceBindingInfo &DRBI) { + DXILResourceBindingInfo &DRBI, + RootSignatureBindingInfo &RSBI, + dxil::ModuleMetadataInfo &MMI) { if (DRM.hasInvalidCounterDirection()) reportInvalidDirection(M, DRM); @@ -94,14 +245,19 @@ static void reportErrors(Module &M, DXILResourceMap &DRM, assert(!DRBI.hasImplicitBinding() && "implicit bindings should be handled in " "DXILResourceImplicitBinding pass"); + + if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI)) + validateRootSignature(M, *RSD, MMI); } -} // namespace PreservedAnalyses DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) { DXILResourceMap &DRM = MAM.getResult<DXILResourceAnalysis>(M); DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M); - reportErrors(M, DRM, DRBI); + RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M); + ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M); + + reportErrors(M, DRM, DRBI, RSBI, MMI); return PreservedAnalyses::all(); } @@ -113,7 +269,12 @@ public: getAnalysis<DXILResourceWrapperPass>().getResourceMap(); DXILResourceBindingInfo &DRBI = getAnalysis<DXILResourceBindingWrapperPass>().getBindingInfo(); - reportErrors(M, DRM, DRBI); + RootSignatureBindingInfo &RSBI = + getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); + dxil::ModuleMetadataInfo &MMI = + getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); + + reportErrors(M, DRM, DRBI, RSBI, MMI); return false; } StringRef getPassName() const override { @@ -125,10 +286,13 @@ public: void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { AU.addRequired<DXILResourceWrapperPass>(); AU.addRequired<DXILResourceBindingWrapperPass>(); + AU.addRequired<DXILMetadataAnalysisWrapperPass>(); + AU.addRequired<RootSignatureAnalysisWrapper>(); AU.addPreserved<DXILResourceWrapperPass>(); AU.addPreserved<DXILResourceBindingWrapperPass>(); AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); AU.addPreserved<ShaderFlagsAnalysisWrapper>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); } }; char DXILPostOptimizationValidationLegacy::ID = 0; @@ -139,6 +303,8 @@ INITIALIZE_PASS_BEGIN(DXILPostOptimizationValidationLegacy, DEBUG_TYPE, INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE, "DXIL Post Optimization Validation", false, false) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 254b7ff..b990b6c 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -43,13 +43,11 @@ public: iterator end() { return FuncToRsMap.end(); } - std::optional<mcdxbc::RootSignatureDesc> - getDescForFunction(const Function *F) { + mcdxbc::RootSignatureDesc *getDescForFunction(const Function *F) { const auto FuncRs = find(F); if (FuncRs == end()) - return std::nullopt; - - return FuncRs->second; + return nullptr; + return &FuncRs->second; } }; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp index 1bd5dd7..1eb03bf 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -13,11 +13,15 @@ #include "DXILWriterPass.h" #include "DXILBitcodeWriter.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -54,49 +58,81 @@ public: }; static void legalizeLifetimeIntrinsics(Module &M) { - for (Function &F : M) { - Intrinsic::ID IID = F.getIntrinsicID(); - if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + LLVMContext &Ctx = M.getContext(); + Type *I64Ty = IntegerType::get(Ctx, 64); + Type *PtrTy = PointerType::get(Ctx, 0); + Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start, + Intrinsic::lifetime_end}; + for (Intrinsic::ID &IID : LifetimeIIDs) { + Function *F = M.getFunction(Intrinsic::getName(IID, {PtrTy}, &M)); + if (!F) continue; - // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr - F.removeFnAttr(Attribute::Memory); - - // Lifetime intrinsics in LLVM 3.7 do not have mangled names - F.setName(Intrinsic::getBaseName(IID)); - - // LLVM 3.7 Lifetime intrinics require an i8* operand, so we insert bitcasts - // to ensure that is the case - for (auto *User : make_early_inc_range(F.users())) { - CallInst *CI = dyn_cast<CallInst>(User); - assert(CI && "Expected user of a lifetime intrinsic function to be a " - "lifetime intrinsic call"); - Value *PtrOperand = CI->getArgOperand(1); - PointerType *PtrTy = cast<PointerType>(PtrOperand->getType()); + // Get or insert an LLVM 3.7-compliant lifetime intrinsic function of the + // form `void @llvm.lifetime.[start/end](i64, ptr)` with the NoUnwind + // attribute + AttributeList Attr; + Attr = Attr.addFnAttribute(Ctx, Attribute::NoUnwind); + FunctionCallee LifetimeCallee = M.getOrInsertFunction( + Intrinsic::getBaseName(IID), Attr, Type::getVoidTy(Ctx), I64Ty, PtrTy); + + // Replace all calls to lifetime intrinsics with calls to the + // LLVM 3.7-compliant version of the lifetime intrinsic + for (User *U : make_early_inc_range(F->users())) { + CallInst *CI = dyn_cast<CallInst>(U); + assert(CI && + "Expected user of a lifetime intrinsic function to be a CallInst"); + + // LLVM 3.7 lifetime intrinsics require an i8* operand, so we insert + // a bitcast to 
ensure that is the case + Value *PtrOperand = CI->getArgOperand(0); + PointerType *PtrOpPtrTy = cast<PointerType>(PtrOperand->getType()); Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand, - PtrTy, "", CI->getIterator()); - CI->setArgOperand(1, NoOpBitCast); + PtrOpPtrTy, "", CI->getIterator()); + + // LLVM 3.7 lifetime intrinsics have an explicit size operand, whose value + // we can obtain from the pointer operand which must be an AllocaInst (as + // of https://github.com/llvm/llvm-project/pull/149310) + AllocaInst *AI = dyn_cast<AllocaInst>(PtrOperand); + assert(AI && + "The pointer operand of a lifetime intrinsic call must be an " + "AllocaInst"); + std::optional<TypeSize> AllocSize = + AI->getAllocationSize(CI->getDataLayout()); + assert(AllocSize.has_value() && + "Expected the allocation size of AllocaInst to be known"); + CallInst *NewCI = CallInst::Create( + LifetimeCallee, + {ConstantInt::get(I64Ty, AllocSize.value().getFixedValue()), + NoOpBitCast}, + "", CI->getIterator()); + for (Attribute ParamAttr : CI->getParamAttributes(0)) + NewCI->addParamAttr(1, ParamAttr); + + CI->eraseFromParent(); } + + F->eraseFromParent(); } } static void removeLifetimeIntrinsics(Module &M) { - for (Function &F : make_early_inc_range(M)) { - if (Intrinsic::ID IID = F.getIntrinsicID(); - IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + Intrinsic::ID LifetimeIIDs[2] = {Intrinsic::lifetime_start, + Intrinsic::lifetime_end}; + for (Intrinsic::ID &IID : LifetimeIIDs) { + Function *F = M.getFunction(Intrinsic::getBaseName(IID)); + if (!F) continue; - for (User *U : make_early_inc_range(F.users())) { - LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U); - assert(LI && "Expected user of lifetime intrinsic function to be " - "a LifetimeIntrinsic instruction"); - BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1)); - assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a " - "BitCastInst"); - LI->eraseFromParent(); + for (User *U : make_early_inc_range(F->users())) { + CallInst *CI = dyn_cast<CallInst>(U); + assert(CI && "Expected user of lifetime function to be a CallInst"); + BitCastInst *BCI = dyn_cast<BitCastInst>(CI->getArgOperand(1)); + assert(BCI && "Expected pointer operand of CallInst to be a BitCastInst"); + CI->eraseFromParent(); BCI->eraseFromParent(); } - F.eraseFromParent(); + F->eraseFromParent(); } } diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def index d506954..b4b48a16 100644 --- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def +++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def @@ -24,6 +24,7 @@ MODULE_ANALYSIS("dxil-root-signature-analysis", dxil::RootSignatureAnalysis()) #define MODULE_PASS(NAME, CREATE_PASS) #endif MODULE_PASS("dxil-cbuffer-access", DXILCBufferAccess()) +MODULE_PASS("dxil-finalize-linkage", DXILFinalizeLinkage()) MODULE_PASS("dxil-data-scalarization", DXILDataScalarization()) MODULE_PASS("dxil-flatten-arrays", DXILFlattenArrays()) MODULE_PASS("dxil-intrinsic-expansion", DXILIntrinsicExpansion()) diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index 84751d2..f5d5a73 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -14,6 +14,7 @@ #include "DirectXTargetMachine.h" #include "DXILCBufferAccess.h" #include "DXILDataScalarization.h" +#include "DXILFinalizeLinkage.h" #include "DXILFlattenArrays.h" #include 
"DXILForwardHandleAccesses.h" #include "DXILIntrinsicExpansion.h" @@ -45,6 +46,8 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Transforms/IPO/GlobalDCE.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include <optional> @@ -62,6 +65,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeEmbedDXILPassPass(*PR); initializeWriteDXILPassPass(*PR); initializeDXContainerGlobalsPass(*PR); + initializeGlobalDCELegacyPassPass(*PR); initializeDXILOpLoweringLegacyPass(*PR); initializeDXILResourceAccessLegacyPass(*PR); initializeDXILResourceImplicitBindingLegacyPass(*PR); @@ -72,6 +76,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeDXILFinalizeLinkageLegacyPass(*PR); initializeDXILPrettyPrinterLegacyPass(*PR); initializeDXILForwardHandleAccessesLegacyPass(*PR); + initializeDSELegacyPassPass(*PR); initializeDXILCBufferAccessLegacyPass(*PR); } @@ -103,6 +108,7 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; } void addCodeGenPrepare() override { addPass(createDXILFinalizeLinkageLegacyPass()); + addPass(createGlobalDCEPass()); addPass(createDXILResourceAccessLegacyPass()); addPass(createDXILIntrinsicExpansionLegacyPass()); addPass(createDXILCBufferAccessLegacyPass()); @@ -112,6 +118,7 @@ public: addPass(createScalarizerPass(DxilScalarOptions)); addPass(createDXILFlattenArraysLegacyPass()); addPass(createDXILForwardHandleAccessesLegacyPass()); + addPass(createDeadStoreEliminationPass()); addPass(createDXILLegalizeLegacyPass()); addPass(createDXILResourceImplicitBindingLegacyPass()); addPass(createDXILTranslateMetadataLegacyPass()); diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 22cff7c..bcddb54 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -526,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext()))); break; + case Hexagon::Y4_crswap10: + MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); + break; default: break; } diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td index e0302b8..fd6d873 100644 --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -class CCIfArgIsVarArg<CCAction A> - : CCIf<"State.isVarArg() && " - "ValNo >= static_cast<HexagonCCState&>(State)" - ".getNumNamedVarArgParams()", A>; - def CC_HexagonStack: CallingConv<[ CCIfType<[i32,v2i16,v4i8], CCAssignToStack<4,4>>, @@ -28,7 +23,7 @@ def CC_Hexagon_Legacy: CallingConv<[ CCIfByVal< CCPassByVal<8,8>>, - CCIfArgIsVarArg< + CCIfArgVarArg< CCDelegateTo<CC_HexagonStack>>, // Pass split values in pairs, allocate odd register if necessary. @@ -58,7 +53,7 @@ def CC_Hexagon: CallingConv<[ CCIfByVal< CCPassByVal<8,1>>, - CCIfArgIsVarArg< + CCIfArgVarArg< CCDelegateTo<CC_HexagonStack>>, // Pass split values in pairs, allocate odd register if necessary. 
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index a920146..b2218ab 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1273,7 +1273,7 @@ void HexagonGenInsert::selectCandidates() { for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) { using use_iterator = MachineRegisterInfo::use_nodbg_iterator; - using InstrSet = SmallSet<const MachineInstr *, 16>; + using InstrSet = SmallPtrSet<const MachineInstr *, 16>; InstrSet UIs; // Count as the number of instructions in which R is used, not the diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index facea64..c54b67c 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -116,23 +116,6 @@ static cl::opt<bool> cl::desc("Disable minimum alignment of 1 for " "arguments passed by value on stack")); -namespace { - - class HexagonCCState : public CCState { - unsigned NumNamedVarArgParams = 0; - - public: - HexagonCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C, - unsigned NumNamedArgs) - : CCState(CC, IsVarArg, MF, locs, C), - NumNamedVarArgParams(NumNamedArgs) {} - unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; } - }; - -} // end anonymous namespace - - // Implement calling convention for Hexagon. static bool CC_SkipOdd(unsigned &ValNo, MVT &ValVT, MVT &LocVT, @@ -497,7 +480,6 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFrameInfo &MFI = MF.getFrameInfo(); auto PtrVT = getPointerTy(MF.getDataLayout()); - unsigned NumParams = CLI.CB ? CLI.CB->getFunctionType()->getNumParams() : 0; if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32); @@ -506,8 +488,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext(), - NumParams); + CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext()); if (Subtarget.useHVXOps()) CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX); @@ -880,9 +861,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, - *DAG.getContext(), - MF.getFunction().getFunctionType()->getNumParams()); + CCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext()); if (Subtarget.useHVXOps()) CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index c34eecd..a3717bb 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -2289,7 +2289,7 @@ CleanupAndExit: // the instructions in Insts are removed. 
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const { - SmallSet<BasicBlock*,8> LoopBlocks; + SmallPtrSet<BasicBlock *, 8> LoopBlocks; LoopBlocks.insert_range(L->blocks()); SetVector<Instruction *> Worklist(llvm::from_range, Insts); diff --git a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 610a81f..33aa6e4 100644 --- a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -32,14 +32,10 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy( // const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; - Args.push_back(Entry); - Entry.Node = Src; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); + Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, ArgTy); const char *SpecialMemcpyName = TLI.getLibcallName( RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ecc1b5d..6a05b5a 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -445,8 +445,8 @@ void HexagonSubtarget::adjustSchedDependency( const HexagonInstrInfo *QII = getInstrInfo(); // Instructions with .new operands have zero latency. - SmallSet<SUnit *, 4> ExclSrc; - SmallSet<SUnit *, 4> ExclDst; + SmallPtrSet<SUnit *, 4> ExclSrc; + SmallPtrSet<SUnit *, 4> ExclDst; if (QII->canExecuteInBundle(*SrcInst, *DstInst) && isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) { Dep.setLatency(0); @@ -630,9 +630,9 @@ static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) { // together with a zero latency. Only one dependence should have a zero // latency. If there are multiple choices, choose the best, and change // the others, if needed. 
-bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst, - const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc, - SmallSet<SUnit*, 4> &ExclDst) const { +bool HexagonSubtarget::isBestZeroLatency( + SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, + SmallPtrSet<SUnit *, 4> &ExclSrc, SmallPtrSet<SUnit *, 4> &ExclDst) const { MachineInstr &SrcInst = *Src->getInstr(); MachineInstr &DstInst = *Dst->getInstr(); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 41555db..b111471 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -366,7 +366,8 @@ private: void restoreLatency(SUnit *Src, SUnit *Dst) const; void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const; bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, - SmallSet<SUnit*, 4> &ExclSrc, SmallSet<SUnit*, 4> &ExclDst) const; + SmallPtrSet<SUnit *, 4> &ExclSrc, + SmallPtrSet<SUnit *, 4> &ExclDst) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 9fb7d47..171e294 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, - ScalarEvolution *SE, - const SCEV *S) const { +InstructionCost +HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *S, + TTI::TargetCostKind CostKind) const { return 0; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index af8dede7..dbf16c9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -111,8 +111,9 @@ public: InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *SE, - const SCEV *S) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S, + TTI::TargetCostKind CostKind) const override; InstructionCost getMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 3de6df5..87d052b 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -1677,9 +1677,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> { return m_CombineOr(m_LShr(V, S), m_AShr(V, S)); }; - const APInt *Qn = nullptr; - if (Value * T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) { - Op.Frac = Qn->getZExtValue(); + uint64_t Qn = 0; + if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) { + Op.Frac = Qn; Exp = T; } else { Op.Frac = 0; @@ -1689,9 +1689,9 @@ auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> { return std::nullopt; // Check if there is rounding added. 
- const APInt *C = nullptr; - if (Value * T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) { - uint64_t CV = C->getZExtValue(); + uint64_t CV; + if (Value *T; + Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) { if (CV != 0 && !isPowerOf2_64(CV)) return std::nullopt; if (CV != 0) diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 039ef4f..6b8d7f1 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -32,7 +32,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/HexagonAttributes.h" diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index d23c5f4..7a0a510 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -356,12 +356,13 @@ void LanaiTargetLowering::LowerAsmOperandForConstraint( static unsigned NumFixedArgs; static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { // Handle fixed arguments with default CC. // Note: Both the default and fast CC handle VarArg the same and hence the // calling convention of the function is not considered here. if (ValNo < NumFixedArgs) { - return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State); + return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State); } // Promote i8/i16 args to i32 diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 6583a0f..5b2d185 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/IR/IRBuilder.h" @@ -2786,7 +2787,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -2811,7 +2812,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -3037,10 +3038,7 @@ SDValue LoongArchTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
TargetLowering::CallLoweringInfo CLI(DAG); @@ -4107,7 +4105,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( LC = RTLIB::getFPTOSINT(Src.getValueType(), VT); MakeLibCallOptions CallOptions; EVT OpVT = Src.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, VT, true); + CallOptions.setTypeListBeforeSoften(OpVT, VT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -4360,7 +4358,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( RTLIB::Libcall LC = OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); @@ -6042,17 +6040,20 @@ static MachineBasicBlock * emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, const LoongArchSubtarget &Subtarget) { unsigned InsOp; + unsigned BroadcastOp; unsigned HalfSize; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case LoongArch::PseudoXVINSGR2VR_B: HalfSize = 16; - InsOp = LoongArch::VINSGR2VR_B; + BroadcastOp = LoongArch::XVREPLGR2VR_B; + InsOp = LoongArch::XVEXTRINS_B; break; case LoongArch::PseudoXVINSGR2VR_H: HalfSize = 8; - InsOp = LoongArch::VINSGR2VR_H; + BroadcastOp = LoongArch::XVREPLGR2VR_H; + InsOp = LoongArch::XVEXTRINS_H; break; } const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -6066,37 +6067,41 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register Elt = MI.getOperand(2).getReg(); unsigned Idx = MI.getOperand(3).getImm(); - Register ScratchReg1 = XSrc; - if (Idx >= HalfSize) { - ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) - .addReg(XSrc) - .addImm(14); - } + if (XSrc.isVirtual() && MRI.getVRegDef(XSrc)->isImplicitDef() && + Idx < HalfSize) { + Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); + Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC); - Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); - Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1) - .addReg(ScratchReg1, 0, LoongArch::sub_128); - BuildMI(*BB, MI, DL, TII->get(InsOp), ScratchSubReg2) - .addReg(ScratchSubReg1) - .addReg(Elt) - .addImm(Idx >= HalfSize ? Idx - HalfSize : Idx); + BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1) + .addReg(XSrc, 0, LoongArch::sub_128); + BuildMI(*BB, MI, DL, + TII->get(HalfSize == 8 ? LoongArch::VINSGR2VR_H + : LoongArch::VINSGR2VR_B), + ScratchSubReg2) + .addReg(ScratchSubReg1) + .addReg(Elt) + .addImm(Idx); + + BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), XDst) + .addImm(0) + .addReg(ScratchSubReg2) + .addImm(LoongArch::sub_128); + } else { + Register ScratchReg1 = MRI.createVirtualRegister(RC); + Register ScratchReg2 = MRI.createVirtualRegister(RC); - Register ScratchReg2 = XDst; - if (Idx >= HalfSize) - ScratchReg2 = MRI.createVirtualRegister(RC); + BuildMI(*BB, MI, DL, TII->get(BroadcastOp), ScratchReg1).addReg(Elt); - BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), ScratchReg2) - .addImm(0) - .addReg(ScratchSubReg2) - .addImm(LoongArch::sub_128); + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg2) + .addReg(ScratchReg1) + .addReg(XSrc) + .addImm(Idx >= HalfSize ? 
48 : 18); - if (Idx >= HalfSize) - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), XDst) + BuildMI(*BB, MI, DL, TII->get(InsOp), XDst) .addReg(XSrc) .addReg(ScratchReg2) - .addImm(2); + .addImm((Idx >= HalfSize ? Idx - HalfSize : Idx) * 17); + } MI.eraseFromParent(); return BB; @@ -7073,7 +7078,8 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, SpLim // s0 s1 s2 s3 s4 s5 s6 s7 s8 diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d8bb16f..0696b11 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>; defm : PairInsertExtractPatV4<v4i64, GRLenVT>; defm : PairInsertExtractPatV4<v4f64, f64>; +def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)), + uimm3:$imm), + (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>; + +def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)), + uimm2:$imm), + (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>; + +def : Pat<(vector_insert v8i32:$xd, + (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1), + uimm3:$imm2)>; + +def : Pat<(vector_insert v4i64:$xd, + (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1), + uimm2:$imm2)>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index ca5d27d..3b38ac9 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -143,8 +143,6 @@ static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { - if (IsResolved && shouldForceRelocation(Fixup, Target)) - IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); if (!Value) return; // Doesn't change encoding. @@ -176,20 +174,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } } -bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, - const MCValue &Target) { - switch (Fixup.getKind()) { - default: - return STI.hasFeature(LoongArch::FeatureRelax); - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - case FK_Data_leb128: - return !Target.isAbsolute(); - } -} - static inline std::pair<MCFixupKind, MCFixupKind> getRelocPairForSize(unsigned Size) { switch (Size) { @@ -216,10 +200,19 @@ getRelocPairForSize(unsigned Size) { // size, the fixup encodes MaxBytesToEmit in the higher bits and references a // per-section marker symbol. 
bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) + return false; + // Use default handling unless linker relaxation is enabled and the // MaxBytesToEmit >= the nop size. - if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) - return false; const unsigned MinNopLen = 4; unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit(); if (MaxBytesToEmit < MinNopLen) @@ -254,8 +247,6 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - if (!F.getParent()->isLinkerRelaxable()) - F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } @@ -448,10 +439,10 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, isPCRelFixupResolved(Target.getSubSym(), F)) return Fallback(); - // In SecA == SecB case. If the linker relaxation is disabled, the + // In SecA == SecB case. If the section is not linker-relaxable, the // FixedValue has already been calculated out in evaluateFixup, // return true and avoid record relocations. - if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax)) + if (&SecA == &SecB && !SecA.isLinkerRelaxable()) return true; } @@ -484,9 +475,16 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, return false; } - IsResolved = Fallback(); // If linker relaxation is enabled and supported by the current relocation, - // append a RELAX relocation. + // generate a relocation and then append a RELAX. 
+ if (Fixup.isLinkerRelaxable()) + IsResolved = false; + if (IsResolved && Fixup.isPCRel()) + IsResolved = isPCRelFixupResolved(Target.getAddSym(), F); + + if (!IsResolved) + Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue); + if (Fixup.isLinkerRelaxable()) { auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_LARCH_RELAX); Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr), @@ -498,8 +496,7 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, std::unique_ptr<MCObjectTargetWriter> LoongArchAsmBackend::createObjectTargetWriter() const { - return createLoongArchELFObjectWriter( - OSABI, Is64Bit, STI.hasFeature(LoongArch::FeatureRelax)); + return createLoongArchELFObjectWriter(OSABI, Is64Bit); } MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 1f13601..f79d3aa 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -44,8 +44,6 @@ public: void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) override; - bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7e021e4..7d54565 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -21,26 +21,23 @@ using namespace llvm; namespace { class LoongArchELFObjectWriter : public MCELFObjectTargetWriter { public: - LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool EnableRelax); + LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); ~LoongArchELFObjectWriter() override; bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override { - return EnableRelax; + return true; } protected: unsigned getRelocType(const MCFixup &, const MCValue &, bool IsPCRel) const override; - bool EnableRelax; }; } // end namespace -LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, - bool EnableRelax) +LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, - /*HasRelocationAddend=*/true), - EnableRelax(EnableRelax) {} + /*HasRelocationAddend=*/true) {} LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} @@ -103,6 +100,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, } std::unique_ptr<MCObjectTargetWriter> -llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax) { - return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit, Relax); +llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) { + return std::make_unique<LoongArchELFObjectWriter>(OSABI, Is64Bit); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h index bb05baa..ab35a00 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -36,7 +36,7 @@ MCAsmBackend 
*createLoongArchAsmBackend(const Target &T, const MCTargetOptions &Options); std::unique_ptr<MCObjectTargetWriter> -createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax); +createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); } // end namespace llvm diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 594ea9f..12c6e1e 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -51,7 +51,9 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM, MVT PtrVT = MVT::i32; - setBooleanContents(ZeroOrOneBooleanContent); + // This is based on M68k SetCC (scc) setting the destination byte to all 1s. + // See also getSetCCResultType(). + setBooleanContents(ZeroOrNegativeOneBooleanContent); auto *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); @@ -1454,10 +1456,7 @@ SDValue M68kTargetLowering::getTLSGetAddr(GlobalAddressSDNode *GA, PointerType *PtrTy = PointerType::get(*DAG.getContext(), 0); ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = PtrTy; - Args.push_back(Entry); + Args.emplace_back(Arg, PtrTy); return LowerExternalSymbolCall(DAG, SDLoc(GA), "__tls_get_addr", std::move(Args)); } diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index e2d4e49..56b71db 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -835,7 +835,7 @@ def : Pat<(MxSub 0, i8 :$src), (NEG8d MxDRD8 :$src)>; def : Pat<(MxSub 0, i16:$src), (NEG16d MxDRD16:$src)>; def : Pat<(MxSub 0, i32:$src), (NEG32d MxDRD32:$src)>; // SExt of i1 values. -// Although we specify `ZeroOrOneBooleanContent` for boolean content, +// Although we specify `ZeroOrNegativeOneBooleanContent` for boolean content, // we're still adding an AND here as we don't know the origin of the i1 value. 
def : Pat<(sext_inreg i8:$src, i1), (NEG8d (AND8di MxDRD8:$src, 1))>; def : Pat<(sext_inreg i16:$src, i1), (NEG16d (AND16di MxDRD16:$src, 1))>; diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td index f4ed627..c5b7ae3 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -701,18 +701,22 @@ def: Pat<(MxExtLoadi16i8 MxCP_ARID:$src), (EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>; def: Pat<(MxExtLoadi16i8 MxCP_ARII:$src), (EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>; +def: Pat<(MxExtLoadi16i8 MxCP_PCD:$src), + (EXTRACT_SUBREG (MOVZXd32q8 MxPCD8:$src), MxSubRegIndex16Lo)>; // i32 <- anyext i8 def: Pat<(i32 (anyext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>; +def: Pat<(MxExtLoadi32i8 MxCP_PCD:$src), (MOVZXd32q8 MxPCD8:$src)>; // i32 <- anyext i16 def: Pat<(i32 (anyext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>; +def: Pat<(MxExtLoadi32i16 MxCP_PCD:$src), (MOVZXd32q16 MxPCD16:$src)>; // trunc patterns def : Pat<(i16 (trunc i32:$src)), diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index d23504c..6da5e66 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -377,6 +377,7 @@ static void AnalyzeArguments(CCState &State, for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) { MVT ArgVT = Args[ValNo].VT; ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags; + Type *OrigTy = Args[ValNo].OrigTy; MVT LocVT = ArgVT; CCValAssign::LocInfo LocInfo = CCValAssign::Full; @@ -411,7 +412,8 @@ static void AnalyzeArguments(CCState &State, RegsLeft -= 1; UsedStack = true; - CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State); + CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy, + State); } else if (Parts <= RegsLeft) { for (unsigned j = 0; j < Parts; j++) { MCRegister Reg = State.AllocateReg(RegList); @@ -421,7 +423,8 @@ static void AnalyzeArguments(CCState &State, } else { UsedStack = true; for (unsigned j = 0; j < Parts; j++) - CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State); + CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, OrigTy, + State); } } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 16247bd..680d279 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index feeadc5e..a8b7c9e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -18,7 +18,6 @@ #include 
"llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 5df70c4..1e1b970 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 4530fc6..ae91c97 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -51,7 +51,6 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index 13237c5..86bb3e6 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -12,59 +12,6 @@ using namespace llvm; -bool MipsCCState::isF128SoftLibCall(const char *CallSym) { - const char *const LibCalls[] = { - "__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", - "__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti", - "__fixunstfdi", "__fixunstfsi", "__fixunstfti", "__floatditf", - "__floatsitf", "__floattitf", "__floatunditf", "__floatunsitf", - "__floatuntitf", "__getf2", "__gttf2", "__letf2", - "__lttf2", "__multf3", "__netf2", "__powitf2", - "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2", - "ceill", "copysignl", "cosl", "exp2l", - "expl", "floorl", "fmal", "fmaxl", - "fmodl", "log10l", "log2l", "logl", - "nearbyintl", "powl", "rintl", "roundl", - "sinl", "sqrtl", "truncl"}; - - // Check that LibCalls is sorted alphabetically. - auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; - assert(llvm::is_sorted(LibCalls, Comp)); - return llvm::binary_search(LibCalls, CallSym, Comp); -} - -/// This function returns true if Ty is fp128, {f128} or i128 which was -/// originally a fp128. -bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) { - if (Ty->isFP128Ty()) - return true; - - if (Ty->isStructTy() && Ty->getStructNumElements() == 1 && - Ty->getStructElementType(0)->isFP128Ty()) - return true; - - // If the Ty is i128 and the function being called is a long double emulation - // routine, then the original type is f128. - // FIXME: This is unsound because these functions could be indirectly called - return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); -} - -/// Return true if the original type was vXfXX. -bool MipsCCState::originalEVTTypeIsVectorFloat(EVT Ty) { - if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint()) - return true; - - return false; -} - -/// Return true if the original type was vXfXX / vXfXX. 
-bool MipsCCState::originalTypeIsVectorFloat(const Type *Ty) { - if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy()) - return true; - - return false; -} - MipsCCState::SpecialCallingConvType MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, const MipsSubtarget &Subtarget) { @@ -81,123 +28,3 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, } return SpecialCallingConv; } - -void MipsCCState::PreAnalyzeCallResultForF128( - const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy, const char *Call) { - for (unsigned i = 0; i < Ins.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, Call)); - OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); - } -} - -/// Identify lowered values that originated from f128 or float arguments and -/// record this for use by RetCC_MipsN. -void MipsCCState::PreAnalyzeCallReturnForF128( - const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy) { - for (unsigned i = 0; i < Outs.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, nullptr)); - OriginalArgWasFloat.push_back( - RetTy->isFloatingPointTy()); - } -} - -/// Identify lower values that originated from vXfXX and record -/// this. -void MipsCCState::PreAnalyzeCallResultForVectorFloat( - const SmallVectorImpl<ISD::InputArg> &Ins, const Type *RetTy) { - for (unsigned i = 0; i < Ins.size(); ++i) { - OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy)); - } -} - -/// Identify lowered values that originated from vXfXX arguments and record -/// this. -void MipsCCState::PreAnalyzeReturnForVectorFloat( - const SmallVectorImpl<ISD::OutputArg> &Outs) { - for (unsigned i = 0; i < Outs.size(); ++i) { - ISD::OutputArg Out = Outs[i]; - OriginalRetWasFloatVector.push_back( - originalEVTTypeIsVectorFloat(Out.ArgVT)); - } -} - -void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { - OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); -} - -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); - OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); - OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); -} - -/// Identify lowered values that originated from f128, float and sret to vXfXX -/// arguments and record this. -void MipsCCState::PreAnalyzeCallOperands( - const SmallVectorImpl<ISD::OutputArg> &Outs, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func) { - for (unsigned i = 0; i < Outs.size(); ++i) { - TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex]; - - OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); - OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); - OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); - } -} - -void MipsCCState::PreAnalyzeFormalArgument(const Type *ArgTy, - ISD::ArgFlagsTy Flags) { - // SRet arguments cannot originate from f128 or {f128} returns so we just - // push false. We have to handle this specially since SRet arguments - // aren't mapped to an original argument. 
- if (Flags.isSRet()) { - OriginalArgWasF128.push_back(false); - OriginalArgWasFloat.push_back(false); - OriginalArgWasFloatVector.push_back(false); - return; - } - - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr)); - OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); - - // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the - // first argument is actually an SRet pointer to a vector, then the next - // argument slot is $a2. - OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); -} - -/// Identify lowered values that originated from f128, float and vXfXX arguments -/// and record this. -void MipsCCState::PreAnalyzeFormalArgumentsForF128( - const SmallVectorImpl<ISD::InputArg> &Ins) { - const MachineFunction &MF = getMachineFunction(); - for (unsigned i = 0; i < Ins.size(); ++i) { - Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); - - // SRet arguments cannot originate from f128 or {f128} returns so we just - // push false. We have to handle this specially since SRet arguments - // aren't mapped to an original argument. - if (Ins[i].Flags.isSRet()) { - OriginalArgWasF128.push_back(false); - OriginalArgWasFloat.push_back(false); - OriginalArgWasFloatVector.push_back(false); - continue; - } - - assert(Ins[i].getOrigArgIndex() < MF.getFunction().arg_size()); - std::advance(FuncArg, Ins[i].getOrigArgIndex()); - - OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArg->getType(), nullptr)); - OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy()); - - // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the - // first argument is actually an SRet pointer to a vector, then the next - // argument slot is $a2. - OriginalArgWasFloatVector.push_back(FuncArg->getType()->isVectorTy()); - } -} diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 30b68e8..4c36d42 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -26,66 +26,7 @@ public: getSpecialCallingConvForCallee(const SDNode *Callee, const MipsSubtarget &Subtarget); - /// This function returns true if CallSym is a long double emulation routine. - /// - /// FIXME: Changing the ABI based on the callee name is unsound. The lib func - /// address could be captured. - static bool isF128SoftLibCall(const char *CallSym); - - static bool originalTypeIsF128(const Type *Ty, const char *Func); - static bool originalEVTTypeIsVectorFloat(EVT Ty); - static bool originalTypeIsVectorFloat(const Type *Ty); - - void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); - - void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); - void PreAnalyzeReturnValue(EVT ArgVT); - private: - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. - void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy, const char * Func); - - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. - void PreAnalyzeCallReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy); - - /// Identify lowered values that originated from f128 arguments and record - /// this. - void - PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func); - - /// Identify lowered values that originated from f128 arguments and record - /// this for use by RetCC_MipsN. 
- void - PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins); - - void - PreAnalyzeCallResultForVectorFloat(const SmallVectorImpl<ISD::InputArg> &Ins, - const Type *RetTy); - - void PreAnalyzeFormalArgumentsForVectorFloat( - const SmallVectorImpl<ISD::InputArg> &Ins); - - void - PreAnalyzeReturnForVectorFloat(const SmallVectorImpl<ISD::OutputArg> &Outs); - - /// Records whether the value has been lowered from an f128. - SmallVector<bool, 4> OriginalArgWasF128; - - /// Records whether the value has been lowered from float. - SmallVector<bool, 4> OriginalArgWasFloat; - - /// Records whether the value has been lowered from a floating point vector. - SmallVector<bool, 4> OriginalArgWasFloatVector; - - /// Records whether the return value has been lowered from a floating point - /// vector. - SmallVector<bool, 4> OriginalRetWasFloatVector; - // Used to handle MIPS16-specific calling convention tweaks. // FIXME: This should probably be a fully fledged calling convention. SpecialCallingConvType SpecialCallingConv; @@ -96,118 +37,6 @@ public: SpecialCallingConvType SpecialCC = NoSpecialCallingConv) : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {} - void PreAnalyzeCallOperands( - const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, const char *Func) { - OriginalArgWasF128.clear(); - OriginalArgWasFloat.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallOperands(Outs, FuncArgs, Func); - } - - void - AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn, - std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const char *Func) { - PreAnalyzeCallOperands(Outs, Fn, FuncArgs, Func); - CCState::AnalyzeCallOperands(Outs, Fn); - } - - // The AnalyzeCallOperands in the base class is not usable since we must - // provide a means of accessing ArgListEntry::IsFixed. Delete them from this - // class. This doesn't stop them being used via the base class though. 
- void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) = delete; - void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, - SmallVectorImpl<ISD::ArgFlagsTy> &Flags, - CCAssignFn Fn) = delete; - - void PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeFormalArgumentsForF128(Ins); - } - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - PreAnalyzeFormalArguments(Ins, Fn); - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void PreAnalyzeCallResult(const Type *RetTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(RetTy, Func)); - OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); - OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy)); - } - - void PreAnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallResultForF128(Ins, RetTy, Func); - PreAnalyzeCallResultForVectorFloat(Ins, RetTy); - } - - void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { - PreAnalyzeCallResult(Ins, Fn, RetTy, Func); - CCState::AnalyzeCallResult(Ins, Fn); - } - - void PreAnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - const MachineFunction &MF = getMachineFunction(); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - PreAnalyzeCallReturnForF128(Outs, MF.getFunction().getReturnType()); - PreAnalyzeReturnForVectorFloat(Outs); - } - - void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - PreAnalyzeReturn(Outs, Fn); - CCState::AnalyzeReturn(Outs, Fn); - } - - bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - CCAssignFn Fn) { - const MachineFunction &MF = getMachineFunction(); - PreAnalyzeCallReturnForF128(ArgsFlags, MF.getFunction().getReturnType()); - PreAnalyzeReturnForVectorFloat(ArgsFlags); - bool Return = CCState::CheckReturn(ArgsFlags, Fn); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - return Return; - } - - bool CheckCallReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - CCAssignFn Fn, const Type *RetTy) { - PreAnalyzeCallReturnForF128(ArgsFlags, RetTy); - PreAnalyzeReturnForVectorFloat(ArgsFlags); - bool Return = CCState::CheckReturn(ArgsFlags, Fn); - OriginalArgWasFloat.clear(); - OriginalArgWasF128.clear(); - OriginalArgWasFloatVector.clear(); - return Return; - } - bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; } - bool WasOriginalArgFloat(unsigned ValNo) { - return OriginalArgWasFloat[ValNo]; - } - bool WasOriginalArgVectorFloat(unsigned ValNo) const { - return OriginalArgWasFloatVector[ValNo]; - } - bool WasOriginalRetVectorFloat(unsigned ValNo) const { - return OriginalRetWasFloatVector[ValNo]; - } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index fa49108..35194e7 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -26,62 +26,6 @@ MipsCallLowering::MipsCallLowering(const 
MipsTargetLowering &TLI) : CallLowering(&TLI) {} namespace { -struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - - /// Is this a return value, or an outgoing call operand. - bool IsReturn; - - MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : OutgoingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} - - bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, - CCState &State_) override { - MipsCCState &State = static_cast<MipsCCState &>(State_); - - if (IsReturn) - State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); - else - State.PreAnalyzeCallOperand(Info.Ty, Func); - - return CallLowering::OutgoingValueAssigner::assignArg( - ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); - } -}; - -struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - - /// Is this a call return value, or an incoming function argument. - bool IsReturn; - - MipsIncomingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : IncomingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} - - bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, - CCState &State_) override { - MipsCCState &State = static_cast<MipsCCState &>(State_); - - if (IsReturn) - State.PreAnalyzeCallResult(Info.Ty, Func); - else - State.PreAnalyzeFormalArgument(Info.Ty, Flags); - - return CallLowering::IncomingValueAssigner::assignArg( - ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); - } -}; - class MipsIncomingValueHandler : public CallLowering::IncomingValueHandler { const MipsSubtarget &STI; @@ -339,9 +283,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, F.getContext()); MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret); - std::string FuncName = F.getName().str(); - MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), /*IsReturn*/ true); + OutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn()); if (!determineAssignments(Assigner, RetInfos, CCInfo)) return false; @@ -392,9 +334,7 @@ bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()), Align(1)); - const std::string FuncName = F.getName().str(); - MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), FuncName.c_str(), - /*IsReturn*/ false); + IncomingValueAssigner Assigner(TLI.CCAssignFnForCall()); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -510,11 +450,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), Align(1)); - const char *Call = - Info.Callee.isSymbol() ? 
Info.Callee.getSymbolName() : nullptr; - - MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), Call, - /*IsReturn*/ false); + OutgoingValueAssigner Assigner(TLI.CCAssignFnForCall()); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -550,11 +486,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLowering::splitToValueTypes(Info.OrigRet, ArgInfos, DL, F.getCallingConv()); - const std::string FuncName = F.getName().str(); SmallVector<CCValAssign, 8> ArgLocs; - MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), - /*IsReturn*/ true); + IncomingValueAssigner Assigner(TLI.CCAssignFnForReturn()); CallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB); MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 0e5c16c..3501f9fb 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -20,19 +20,15 @@ class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">; /// Match if the original argument (before lowering) was a float. /// For example, this is true for i32's that were lowered from soft-float. -class CCIfOrigArgWasFloat<CCAction A> - : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)", - A>; +class CCIfOrigArgWasFloat<CCAction A> : CCIf<"OrigTy->isFloatingPointTy()", A>; /// Match if the original argument (before lowering) was a 128-bit float (i.e. /// long double). -class CCIfOrigArgWasF128<CCAction A> - : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>; +class CCIfOrigArgWasF128<CCAction A> : CCIf<"OrigTy->isFP128Ty()", A>; -/// Match if the return was a floating point vector. +/// Match if the return was not a floating point vector. class CCIfOrigArgWasNotVectorFloat<CCAction A> - : CCIf<"!static_cast<MipsCCState *>(&State)" - "->WasOriginalRetVectorFloat(ValNo)", A>; + : CCIf<"!OrigTy->isVectorTy() || !OrigTy->isFPOrFPVectorTy()", A>; /// Match if the special calling conv is the specified value. class CCIfSpecialCallingConv<string CC, CCAction A> diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8067dbc..2a2ccf7 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -232,7 +232,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. 
- SmallSet<MachineBasicBlock*, 4> NewWaterList; + SmallPtrSet<MachineBasicBlock *, 4> NewWaterList; using water_iterator = std::vector<MachineBasicBlock *>::iterator; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index f3812d1..1ce8d7e3 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -266,17 +266,19 @@ public: static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) LLVM_ATTRIBUTE_UNUSED; + Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { llvm_unreachable("should not be called"); } static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { llvm_unreachable("should not be called"); } @@ -1144,8 +1146,12 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, unsigned &NumBytes) { CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; + SmallVector<Type *, 16> ArgTys; + for (const ArgListEntry &Arg : CLI.Args) + ArgTys.push_back(Arg.Val->getType()); CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, ArgTys, + CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getStackSize(); // This is the minimum argument area used for A0-A3. @@ -1287,9 +1293,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, - CLI.Symbol ? CLI.Symbol->getName().data() - : nullptr); + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips); // Only handle a single return value. 
if (RVLocs.size() != 1) diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 881ba8e..1491300 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2325,10 +2325,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT); ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Argument; - Entry.Ty = PtrTy; - Args.push_back(Entry); + Args.emplace_back(Argument, PtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL) @@ -3040,14 +3037,13 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, ArrayRef<MCPhysReg> F64Regs) { + Type *OrigTy, CCState &State, + ArrayRef<MCPhysReg> F64Regs) { const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>( State.getMachineFunction().getSubtarget()); static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; - const MipsCCState * MipsState = static_cast<MipsCCState *>(&State); - static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 }; static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 }; @@ -3089,7 +3085,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, State.getFirstUnallocated(F32Regs) != ValNo; Align OrigAlign = ArgFlags.getNonZeroOrigAlign(); bool isI64 = (ValVT == MVT::i32 && OrigAlign == Align(8)); - bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo); + bool isVectorFloat = OrigTy->isVectorTy() && OrigTy->isFPOrFPVectorTy(); // The MIPS vector ABI for floats passes them in a pair of registers if (ValVT == MVT::i32 && isVectorFloat) { @@ -3160,25 +3156,29 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, return false; } -static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { +static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 }; - return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); + return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State, + F64Regs); } -static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { +static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 }; - return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); + return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, OrigTy, State, + F64Regs); } static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) LLVM_ATTRIBUTE_UNUSED; + Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; #include "MipsGenCallingConv.inc" @@ -3392,8 +3392,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv); CCInfo.AllocateStack(ReservedArgArea, Align(1)); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), - ES ? 
ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips); // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); @@ -3688,10 +3687,7 @@ SDValue MipsTargetLowering::LowerCallResult( MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - const ExternalSymbolSDNode *ES = - dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, - ES ? ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -3969,7 +3965,7 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, LLVMContext &Context, const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - return CCInfo.CheckCallReturn(Outs, RetCC_Mips, RetTy); + return CCInfo.CheckReturn(Outs, RetCC_Mips); } bool MipsTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, @@ -4408,7 +4404,7 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'K': // unsigned 16 bit immediate if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { EVT Type = Op.getValueType(); - uint64_t Val = (uint64_t)C->getZExtValue(); + uint64_t Val = C->getZExtValue(); if (isUInt<16>(Val)) { Result = DAG.getTargetConstant(Val, DL, Type); break; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 38912a7..0c581dcc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // Map the global virtual register number to a register class specific // virtual register number starting from 1 with that class. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - //unsigned numRegClasses = TRI->getNumRegClasses(); // Emit the Fake Stack Object const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1479,13 +1478,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // global virtual // register number and the per class virtual register number. // We use the per class virtual register number in the ptx output. 
- unsigned int numVRs = MRI->getNumVirtRegs(); - for (unsigned i = 0; i < numVRs; i++) { - Register vr = Register::index2VirtReg(i); - const TargetRegisterClass *RC = MRI->getRegClass(vr); - DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; - int n = regmap.size(); - regmap.insert(std::make_pair(vr, n + 1)); + for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) { + Register VR = Register::index2VirtReg(I); + if (MRI->use_empty(VR) && MRI->def_empty(VR)) + continue; + auto &RCRegMap = VRegMapping[MRI->getRegClass(VR)]; + RCRegMap[VR] = RCRegMap.size() + 1; } // Emit declaration of the virtual registers or 'physical' registers for diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 6068035..520ce4d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -70,7 +70,7 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const { } bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const { - return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N); + return Subtarget->getTargetLowering()->usePrecSqrtF32(N); } bool NVPTXDAGToDAGISel::useF32FTZ() const { @@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const { return TL->allowFMA(*MF, OptLevel); } -bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { - const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); - return TL->allowUnsafeFPMath(*MF); -} - bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; } /// Select - Select instructions not customized! Used for @@ -1027,6 +1022,72 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16, } } +static inline bool isAddLike(const SDValue V) { + return V.getOpcode() == ISD::ADD || + (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); +} + +static SDValue stripAssertAlign(SDValue N) { + if (N.getOpcode() == ISD::AssertAlign) + N = N.getOperand(0); + return N; +} + +// selectBaseADDR - Match a dag node which will serve as the base address for an +// ADDR operand pair. 
+static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) { + N = stripAssertAlign(N); + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N)) + return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), + GA->getValueType(0), GA->getOffset(), + GA->getTargetFlags()); + if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N)) + return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), + ES->getTargetFlags()); + if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N)) + return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0)); + + return N; +} + +static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) { + Addr = stripAssertAlign(Addr); + APInt AccumulatedOffset(64u, 0); + while (isAddLike(Addr)) { + const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + if (!CN) + break; + + const APInt CI = CN->getAPIntValue().sext(64); + if (!(CI + AccumulatedOffset).isSignedIntN(32)) + break; + + AccumulatedOffset += CI; + Addr = stripAssertAlign(Addr->getOperand(0)); + } + return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL, + MVT::i32); +} + +static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) { + SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG); + SDValue Base = selectBaseADDR(Addr, DAG); + return {Base, Offset}; +} + +// Select a pair of operands which represent a valid PTX address, this could be +// one of the following things: +// - [var] - Offset is simply set to 0 +// - [reg] - Offset is simply set to 0 +// - [reg+immOff] +// - [var+immOff] +// Note that immOff must fit into a 32-bit signed integer. +bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base, + SDValue &Offset) { + std::tie(Base, Offset) = selectADDR(Addr, CurDAG); + return true; +} + bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MemSDNode *LD = cast<MemSDNode>(N); assert(LD->readMem() && "Expected load"); @@ -1062,8 +1123,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { FromTypeWidth <= 128 && "Invalid width for load"); // Create the machine instruction DAG - SDValue Offset, Base; - SelectADDR(N->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), @@ -1144,8 +1204,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); - SDValue Offset, Base; - SelectADDR(N->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), @@ -1213,8 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) { assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); - SDValue Base, Offset; - SelectADDR(LD->getOperand(1), Base, Offset); + const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG); SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()}; @@ -1278,8 +1336,7 @@ bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) { SDValue Addr = LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 
2 : 1); - SDValue Base, Offset; - SelectADDR(Addr, Base, Offset); + const auto [Base, Offset] = selectADDR(Addr, CurDAG); SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()}; std::optional<unsigned> Opcode; @@ -1339,9 +1396,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && "Invalid width for store"); - SDValue Offset, Base; - SelectADDR(ST->getBasePtr(), Base, Offset); - + const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG); SDValue Ops[] = {selectPossiblyImm(Value), getI32Imm(Ordering, DL), getI32Imm(Scope, DL), @@ -1399,9 +1454,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for store"); - SDValue Offset, Base; - SelectADDR(Addr, Base, Offset); - + const auto [Base, Offset] = selectADDR(Addr, CurDAG); Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, Chain}); @@ -1708,59 +1761,6 @@ bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) { return true; } -static inline bool isAddLike(const SDValue V) { - return V.getOpcode() == ISD::ADD || - (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); -} - -// selectBaseADDR - Match a dag node which will serve as the base address for an -// ADDR operand pair. -static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) { - if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N)) - return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), - GA->getValueType(0), GA->getOffset(), - GA->getTargetFlags()); - if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N)) - return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), - ES->getTargetFlags()); - if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N)) - return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0)); - - return N; -} - -static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) { - APInt AccumulatedOffset(64u, 0); - while (isAddLike(Addr)) { - const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - if (!CN) - break; - - const APInt CI = CN->getAPIntValue().sext(64); - if (!(CI + AccumulatedOffset).isSignedIntN(32)) - break; - - AccumulatedOffset += CI; - Addr = Addr->getOperand(0); - } - return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL, - MVT::i32); -} - -// Select a pair of operands which represent a valid PTX address, this could be -// one of the following things: -// - [var] - Offset is simply set to 0 -// - [reg] - Offset is simply set to 0 -// - [reg+immOff] -// - [var+immOff] -// Note that immOff must fit into a 32-bit signed integer. 
-bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base, - SDValue &Offset) { - Offset = accumulateOffset(Addr, SDLoc(Addr), CurDAG); - Base = selectBaseADDR(Addr, CurDAG); - return true; -} - SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) { if (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); @@ -1774,37 +1774,20 @@ SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) { return V; } -bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, - unsigned int spN) const { - const Value *Src = nullptr; - if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) { - if (spN == 0 && mN->getMemOperand()->getPseudoValue()) - return true; - Src = mN->getMemOperand()->getValue(); - } - if (!Src) - return false; - if (auto *PT = dyn_cast<PointerType>(Src->getType())) - return (PT->getAddressSpace() == spN); - return false; -} - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector<SDValue> &OutOps) { - SDValue Op0, Op1; switch (ConstraintID) { default: return true; - case InlineAsm::ConstraintCode::m: // memory - if (SelectADDR(Op, Op0, Op1)) { - OutOps.push_back(Op0); - OutOps.push_back(Op1); - return false; - } - break; + case InlineAsm::ConstraintCode::m: { // memory + const auto [Base, Offset] = selectADDR(Op, CurDAG); + OutOps.push_back(Base); + OutOps.push_back(Offset); + return false; + } } return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 9e0f88e5..6573172 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; bool allowFMA() const; - bool allowUnsafeFPMath() const; bool doRsqrtOpt() const; NVPTXScopes Scopes{}; @@ -102,8 +101,6 @@ private: SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); SDValue selectPossiblyImm(SDValue V); - bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; - // Returns the Memory Order and Scope that the PTX memory instruction should // use, and inserts appropriate fence instruction before the memory // instruction, if needed to implement the instructions memory order. 
Required diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d4f0cc9..74e6c13 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -125,10 +124,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, if (UsePrecDivF32.getNumOccurrences() > 0) return UsePrecDivF32; - // Otherwise, use div.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return NVPTX::DivPrecisionLevel::Approx; - const SDNodeFlags Flags = N.getFlags(); if (Flags.hasApproximateFuncs()) return NVPTX::DivPrecisionLevel::Approx; @@ -136,16 +131,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, return NVPTX::DivPrecisionLevel::IEEE754; } -bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF, - const SDNode *N) const { +bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const { // If nvptx-prec-sqrtf32 is used on the command-line, always honor it if (UsePrecSqrtF32.getNumOccurrences() > 0) return UsePrecSqrtF32; - // Otherwise, use sqrt.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return false; - if (N) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasApproximateFuncs()) @@ -680,6 +670,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // No support for these operations with v2f32. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // Need custom lowering in case the index is dynamic. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -1191,8 +1183,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, bool &UseOneConst, bool Reciprocal) const { if (!(Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && - !usePrecSqrtF32(DAG.getMachineFunction())))) + (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) return SDValue(); if (ExtraSteps == ReciprocalEstimate::Unspecified) @@ -2849,8 +2840,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) { SDLoc(Op), Opcode, DAG); } -static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, - bool AllowUnsafeFPMath) { +static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) { // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches // the semantics of LLVM's frem. 
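For reference, a standalone C++ sketch (not LLVM code) of the "poor man's fmod" expansion described in the comment above: frem(x, y) is lowered to x - trunc(x / y) * y, plus the guard that returns x when y is infinite, which the bare expansion would get wrong and which the patch now elides only when the no-infs flag is present.

#include <cmath>
#include <cstdio>
#include <limits>

static double poorMansFRem(double X, double Y) {
  double Div = X / Y;
  double Trunc = std::trunc(Div);
  double Sub = X - Trunc * Y;
  // Without no-infs information the expansion alone is wrong for Y == +/-inf:
  // trunc(X / inf) * inf evaluates to 0 * inf == NaN, while frem returns X.
  return std::isinf(Y) ? X : Sub;
}

int main() {
  std::printf("%g\n", poorMansFRem(7.5, 2.0));  // prints 1.5
  std::printf("%g\n",
              poorMansFRem(7.5, std::numeric_limits<double>::infinity()));  // prints 7.5
  return 0;
}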
@@ -2867,7 +2857,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, Flags | SDNodeFlags::AllowContract); - if (AllowUnsafeFPMath || Flags.hasNoInfs()) + if (Flags.hasNoInfs()) return Sub; // If Y is infinite, return X @@ -3012,7 +3002,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return lowerCTLZCTPOP(Op, DAG); case ISD::FREM: - return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction())); + return lowerFREM(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); @@ -4866,17 +4856,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) return true; - return allowUnsafeFPMath(MF); -} - -bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const { - // Honor TargetOptions flags that explicitly say unsafe math is okay. - if (MF.getTarget().Options.UnsafeFPMath) - return true; - - // Allow unsafe math if unsafe-fp-math attribute explicitly says so. - const Function &F = MF.getFunction(); - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); + return false; } static bool isConstZero(const SDValue &Operand) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 43e721a..27f099e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -206,8 +206,7 @@ public: // Get whether we should use a precise or approximate 32-bit floating point // sqrt instruction. - bool usePrecSqrtF32(const MachineFunction &MF, - const SDNode *N = nullptr) const; + bool usePrecSqrtF32(const SDNode *N = nullptr) const; // Get whether we should use instructions that flush floating-point denormals // to sign-preserving zero. @@ -220,7 +219,6 @@ public: unsigned combineRepeatedFPDivisors() const override { return 2; } bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const; - bool allowUnsafeFPMath(const MachineFunction &MF) const; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT) const override { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1ab41bf..7b13509 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -148,13 +148,16 @@ class OneUse2<SDPatternOperator operator> : PatFrag<(ops node:$A, node:$B), (operator node:$A, node:$B), [{ return N->hasOneUse(); }]>; -class fpimm_pos_inf<ValueType vt> - : FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>; - class zeroinitializer<ValueType vt> : PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>; +def fpimm_pos_inf : FPImmLeaf<fAny, [{ return Imm.isPosInfinity(); }]>; +def fpimm_0 : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>; +def fpimm_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(1.0); }]>; +def fpimm_neg_1 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(-1.0); }]>; + + // Operands which can hold a Register or an Immediate. // // Unfortunately, since most register classes can hold multiple types, we must @@ -268,7 +271,7 @@ multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t, // The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). 
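The fpimm_0 / fpimm_1 / fpimm_neg_1 / fpimm_pos_inf leaves added to NVPTXInstrInfo.td above are defined over fAny because the APFloat predicates they use are independent of the float semantics, which is what lets them replace the per-type f32imm_1 / f64imm_1 / f64imm_neg1 leaves removed further down. A small stand-alone check of that property (assumes an LLVM tree to compile against; not part of the patch):

    #include "llvm/ADT/APFloat.h"

    #include <iostream>

    int main() {
      // The same predicate answers for IEEEsingle and IEEEdouble values, so one
      // fAny-based FPImmLeaf can stand in for the removed per-type leaves.
      llvm::APFloat SingleOne(1.0f); // IEEEsingle
      llvm::APFloat DoubleOne(1.0);  // IEEEdouble

      std::cout << std::boolalpha
                << SingleOne.isExactlyValue(1.0) << ' '    // true
                << DoubleOne.isExactlyValue(1.0) << ' '    // true
                << llvm::APFloat(-0.0f).isZero() << '\n';  // true
      return 0;
    }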
multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> { foreach t = [I16RT, I32RT, I64RT] in - defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>; + defm t.Size# : I3Inst<op_str # t.Size, op_node, t, commutative>; } class I16x2<string OpcStr, SDNode OpNode> : @@ -761,10 +764,10 @@ def fabs_oneuse : OneUse1<fabs>; def TESTINF_f32r : BasicNVPTXInst<(outs B1:$p), (ins B32:$a), "testp.infinite.f32", - [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf<f32>))]>; + [(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf))]>; def TESTINF_f64r : BasicNVPTXInst<(outs B1:$p), (ins B64:$a), "testp.infinite.f64", - [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf<f64>))]>; + [(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf))]>; //----------------------------------- // Integer Arithmetic @@ -787,8 +790,8 @@ defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; defm MULT : I3<"mul.lo.s", mul, commutative = true>; -defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>; -defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>; +defm MUL_HI_S : I3<"mul.hi.s", mulhs, commutative = true>; +defm MUL_HI_U : I3<"mul.hi.u", mulhu, commutative = true>; defm SDIV : I3<"div.s", sdiv, commutative = false>; defm UDIV : I3<"div.u", udiv, commutative = false>; @@ -905,22 +908,6 @@ let Predicates = [hasOptEnabled] in { // Floating Point Arithmetic //----------------------------------- -// Constant 1.0f -def f32imm_1 : FPImmLeaf<f32, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEsingle() && - Imm.convertToFloat() == 1.0f; -}]>; -// Constant 1.0 (double) -def f64imm_1 : FPImmLeaf<f64, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && - Imm.convertToDouble() == 1.0; -}]>; -// Constant -1.0 (double) -def f64imm_neg1 : FPImmLeaf<f64, [{ - return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && - Imm.convertToDouble() == -1.0; -}]>; - defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; @@ -994,7 +981,7 @@ def FRCP64r : BasicNVPTXInst<(outs B64:$dst), (ins B64:$b), "rcp.rn.f64", - [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>; + [(set f64:$dst, (fdiv fpimm_1, f64:$b))]>; def FDIV64rr : BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, B64:$b), @@ -1008,7 +995,7 @@ def FDIV64ri : // fdiv will be converted to rcp // fneg (fdiv 1.0, X) => fneg (rcp.rn X) -def : Pat<(fdiv f64imm_neg1, f64:$b), +def : Pat<(fdiv fpimm_neg_1, f64:$b), (FNEGf64 (FRCP64r $b))>; // @@ -1021,21 +1008,21 @@ def fdiv_approx : PatFrag<(ops node:$a, node:$b), }]>; -def FRCP32_approx_r : +def RCP_APPROX_F32_r : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$b), (ins FTZFlag:$ftz), "rcp.approx$ftz.f32", - [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>; + [(set f32:$dst, (fdiv_approx fpimm_1, f32:$b))]>; // // F32 Approximate division // -def FDIV32_approx_rr : +def DIV_APPROX_F32_rr : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>; -def FDIV32_approx_ri : +def DIV_APPROX_F32_ri : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$a, f32imm:$b), (ins FTZFlag:$ftz), "div.approx$ftz.f32", @@ -1052,8 +1039,8 @@ def fdiv_full : PatFrag<(ops node:$a, node:$b), }]>; -def : Pat<(fdiv_full f32imm_1, f32:$b), - (FRCP32_approx_r $b)>; +def : Pat<(fdiv_full fpimm_1, f32:$b), + (RCP_APPROX_F32_r $b)>; // // F32 Semi-accurate division @@ -1081,7 +1068,7 @@ def FRCP32r_prec : BasicFlagsNVPTXInst<(outs 
B32:$dst), (ins B32:$b), (ins FTZFlag:$ftz), "rcp.rn$ftz.f32", - [(set f32:$dst, (fdiv_ftz f32imm_1, f32:$b))]>; + [(set f32:$dst, (fdiv_ftz fpimm_1, f32:$b))]>; // // F32 Accurate division // @@ -1096,7 +1083,7 @@ def FDIV32ri_prec : "div.rn$ftz.f32", [(set f32:$dst, (fdiv_ftz f32:$a, fpimm:$b))]>; -def : Pat<(fdiv f32imm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>; +def : Pat<(fdiv fpimm_1, f32:$b), (FRCP32r_prec $b, NoFTZ)>; def : Pat<(fdiv f32:$a, f32:$b), (FDIV32rr_prec $a, $b, NoFTZ)>; def : Pat<(fdiv f32:$a, fpimm:$b), (FDIV32ri_prec $a, fpimm:$b, NoFTZ)>; @@ -1146,9 +1133,8 @@ defm FMA_F64 : FMA<F64RT, allow_ftz = false>; // sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> - : PatFrag<(ops node:$A), - (operator node:$A), [{ - return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs(); + : PatFrag<(ops node:$A), (operator node:$A), [{ + return N->getFlags().hasApproximateFuncs(); }]>; def SIN_APPROX_f32 : @@ -1519,23 +1505,28 @@ def MmaCode : Operand<i32> { // Get pointer to local stack. let hasSideEffects = false in { def MOV_DEPOT_ADDR : NVPTXInst<(outs B32:$d), (ins i32imm:$num), - "mov.b32 \t$d, __local_depot$num;", []>; + "mov.b32 \t$d, __local_depot$num;">; def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs B64:$d), (ins i32imm:$num), - "mov.b64 \t$d, __local_depot$num;", []>; + "mov.b64 \t$d, __local_depot$num;">; } - -// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let hasSideEffects = false, isAsCheapAsAMove = true in { - let isMoveReg = true in +let hasSideEffects = false in { + let isMoveReg = true, isAsCheapAsAMove = true in class MOVr<RegisterClass RC, string OpStr> : BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>; - let isMoveImm = true in + let isMoveImm = true, isAsCheapAsAMove = true in class MOVi<RegTyInfo t, string suffix> : BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src), "mov." # suffix, [(set t.Ty:$dst, t.ImmNode:$src)]>; + + // We don't want to set isAsCheapAsAMove to true for these instructions as + // this would prevent CSE and resulted in regressions (see discussion after + // PR-145581 in llvm-project). 
+ class MovSymInst<RegTyInfo t> : + BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src), + "mov.b" # t.Size>; } def MOV_B1_r : MOVr<B1, "pred">; @@ -1553,6 +1544,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">; def MOV_F32_i : MOVi<F32RT, "b32">; def MOV_F64_i : MOVi<F64RT, "b64">; +def MOV_B32_sym : MovSymInst<I32RT>; +def MOV_B64_sym : MovSymInst<I64RT>; + def to_tglobaladdr : SDNodeXForm<globaladdr, [{ return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N), @@ -1569,17 +1563,17 @@ def to_tframeindex : SDNodeXForm<frameindex, [{ return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0)); }]>; -def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>; -def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>; +def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>; +def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>; -def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>; -def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>; +def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>; +def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr), - "add.u32 \t$dst, ${addr:add};", []>; + "add.u32 \t$dst, ${addr:add};">; def LEA_ADDRi64 : NVPTXInst<(outs B64:$dst), (ins ADDR:$addr), - "add.u64 \t$dst, ${addr:add};", []>; + "add.u64 \t$dst, ${addr:add};">; def : Pat<(i32 frameindex:$fi), (LEA_ADDRi (to_tframeindex $fi), 0)>; def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; @@ -1644,12 +1638,12 @@ foreach is_convergent = [0, 1] in { NVPTXInst<(outs), (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params, i32imm:$proto), - "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;", []>; + "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;">; def CALL_UNI # convergent_suffix : NVPTXInst<(outs), (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params), - "call.uni${rets:RetList} $addr, (${params:ParamList});", []>; + "call.uni${rets:RetList} $addr, (${params:ParamList});">; } defvar call_inst = !cast<NVPTXInst>("CALL" # convergent_suffix); @@ -1665,10 +1659,10 @@ foreach is_convergent = [0, 1] in { def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), - ".param .align $align .b8 \t$a[$size];", []>; + ".param .align $align .b8 \t$a[$size];">; def DECLARE_PARAM_scalar : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".param .b$size \t$a;", []>; + ".param .b$size \t$a;">; def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), (DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>; @@ -1741,7 +1735,7 @@ class LD<NVPTXRegClass regclass> (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " - "\t$dst, [$addr];", []>; + "\t$dst, [$addr];">; let mayLoad=1, hasSideEffects=0 in { def LD_i16 : LD<B16>; @@ -1756,7 +1750,7 @@ class ST<DAGOperand O> AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" - " \t[$addr], $src;", []>; + " \t[$addr], $src;">; let mayStore=1, hasSideEffects=0 in { def ST_i16 : ST<RI16>; @@ -1773,13 +1767,13 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, 
AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr];", []>; + "\t{{$dst1, $dst2}}, [$addr];">; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];">; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, @@ -1788,7 +1782,7 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " - "[$addr];", []>; + "[$addr];">; } let mayLoad=1, hasSideEffects=0 in { defm LDV_i16 : LD_VEC<B16>; @@ -1803,14 +1797,14 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " - "\t[$addr], {{$src1, $src2}};", []>; + "\t[$addr], {{$src1, $src2}};">; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " - "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + "\t[$addr], {{$src1, $src2, $src3, $src4}};">; if support_v8 then def _v8 : NVPTXInst< (outs), @@ -1820,7 +1814,7 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " - "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>; + "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};">; } let mayStore=1, hasSideEffects=0 in { @@ -2015,60 +2009,52 @@ let hasSideEffects = false in { def V4I16toI64 : NVPTXInst<(outs B64:$d), (ins B16:$s1, B16:$s2, B16:$s3, B16:$s4), - "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; + "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};">; def V2I16toI32 : NVPTXInst<(outs B32:$d), (ins B16:$s1, B16:$s2), - "mov.b32 \t$d, {{$s1, $s2}};", []>; + "mov.b32 \t$d, {{$s1, $s2}};">; def V2I32toI64 : NVPTXInst<(outs B64:$d), (ins B32:$s1, B32:$s2), - "mov.b64 \t$d, {{$s1, $s2}};", []>; + "mov.b64 \t$d, {{$s1, $s2}};">; def V2I64toI128 : NVPTXInst<(outs B128:$d), (ins B64:$s1, B64:$s2), - "mov.b128 \t$d, {{$s1, $s2}};", []>; + "mov.b128 \t$d, {{$s1, $s2}};">; // unpack a larger int register to a set of smaller int registers def I64toV4I16 : NVPTXInst<(outs B16:$d1, B16:$d2, B16:$d3, B16:$d4), (ins B64:$s), - "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; + "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;">; def I32toV2I16 : NVPTXInst<(outs B16:$d1, B16:$d2), (ins B32:$s), - "mov.b32 \t{{$d1, $d2}}, $s;", []>; + "mov.b32 \t{{$d1, $d2}}, $s;">; def I64toV2I32 : NVPTXInst<(outs B32:$d1, B32:$d2), (ins B64:$s), - "mov.b64 \t{{$d1, $d2}}, $s;", []>; + "mov.b64 \t{{$d1, $d2}}, $s;">; def I128toV2I64: NVPTXInst<(outs B64:$d1, B64:$d2), (ins B128:$s), - "mov.b128 \t{{$d1, $d2}}, $s;", []>; + "mov.b128 \t{{$d1, $d2}}, $s;">; - def I32toI16H : NVPTXInst<(outs B16:$high), - (ins B32:$s), - "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", - []>; - def I32toI16L : NVPTXInst<(outs B16:$low), - (ins B32:$s), - "{{ .reg .b16 
tmp; mov.b32 {$low, tmp}, $s; }}", - []>; - def I64toI32H : NVPTXInst<(outs B32:$high), - (ins B64:$s), - "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", - []>; - def I64toI32L : NVPTXInst<(outs B32:$low), - (ins B64:$s), - "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}", - []>; + def I32toI16H : NVPTXInst<(outs B16:$high), (ins B32:$s), + "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}">; + def I32toI16L : NVPTXInst<(outs B16:$low), (ins B32:$s), + "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}">; + def I64toI32H : NVPTXInst<(outs B32:$high), (ins B64:$s), + "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}">; + def I64toI32L : NVPTXInst<(outs B32:$low), (ins B64:$s), + "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}">; // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the // unused high/low part. let Predicates = [hasPTX<71>] in { def I32toI16H_Sink : NVPTXInst<(outs B16:$high), (ins B32:$s), - "mov.b32 \t{{_, $high}}, $s;", []>; + "mov.b32 \t{{_, $high}}, $s;">; def I32toI16L_Sink : NVPTXInst<(outs B16:$low), (ins B32:$s), - "mov.b32 \t{{$low, _}}, $s;", []>; + "mov.b32 \t{{$low, _}}, $s;">; def I64toI32H_Sink : NVPTXInst<(outs B32:$high), (ins B64:$s), - "mov.b64 \t{{_, $high}}, $s;", []>; + "mov.b64 \t{{_, $high}}, $s;">; def I64toI32L_Sink : NVPTXInst<(outs B32:$low), (ins B64:$s), - "mov.b64 \t{{$low, _}}, $s;", []>; + "mov.b64 \t{{$low, _}}, $s;">; } } @@ -2426,10 +2412,6 @@ foreach scope = ["sys", "gpu", "cluster", "cta"] in { def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>; } -def fpimm_any_zero : FPImmLeaf<fAny, [{ - return Imm.isZero(); -}]>; - // Perform substitution if fma only has one use, and also if instruction has // nnan instruction flag or if the TM has NoNaNsFPMath def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c), @@ -2451,11 +2433,11 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat> [(set t.Ty:$dst, (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan t.Ty:$a, t.Ty:$b, t.Ty:$c), zero_pat))]>; let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in { - def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>; + def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_0>; def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>; } let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in { - def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>; + def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_0>; def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d4a0ca7..721afae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -6,44 +6,24 @@ // //===----------------------------------------------------------------------===// -def immFloat0 : PatLeaf<(fpimm), [{ - float f = (float)N->getValueAPF().convertToFloat(); - return (f==0.0f); -}]>; - -def immFloat1 : PatLeaf<(fpimm), [{ - float f = (float)N->getValueAPF().convertToFloat(); - return (f==1.0f); -}]>; - -def immDouble0 : PatLeaf<(fpimm), [{ - double d = (double)N->getValueAPF().convertToDouble(); - return (d==0.0); -}]>; - -def immDouble1 : PatLeaf<(fpimm), [{ - double d = (double)N->getValueAPF().convertToDouble(); - return (d==1.0); -}]>; - def AS_match { code generic = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC; }]; code shared = [{ 
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED; }]; code shared_cluster = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED_CLUSTER); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_SHARED_CLUSTER; }]; code global = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL; }]; code const = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_CONST; }]; code param = [{ - return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + return cast<MemSDNode>(N)->getAddressSpace() == llvm::ADDRESS_SPACE_PARAM; }]; } @@ -659,22 +639,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> def "" : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>, + !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC_CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>, + !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; } @@ -876,11 +856,11 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode> def "" : NVPTXInst<(outs), !con((ins rc:$src, B64:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), - !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>, + !strconcat(prefix, "${red_op}", suffix, asm_str, ";")>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), !con((ins rc:$src, B64:$tmap), dims_dag, (ins B64:$ch, TMAReductionFlags:$red_op)), - !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>, + !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;")>, Requires<[hasPTX<80>, hasSM<90>]>; } @@ -1112,30 +1092,30 @@ let Predicates = [hasPTX<70>, hasSM<80>] in { // max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0. // Same story for fmax, fmin. 
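The cvt.sat patterns right after this comment only match the fmin(1.0, fmax(0.0, x)) nesting; as the comment notes, the other nesting disagrees with saturation on NaN inputs. A scalar sketch using C's fmin/fmax, which share the "return the non-NaN operand" behaviour of the PTX min/max used here; sat() below models cvt.*.sat flushing NaN to +0.0, which is an assumption worth double-checking against the PTX ISA:

    #include <cmath>
    #include <cstdio>

    // Models PTX cvt.*.sat: clamp to [0.0, 1.0] with NaN flushed to +0.0.
    static float sat(float X) {
      if (std::isnan(X))
        return 0.0f;
      return X < 0.0f ? 0.0f : (X > 1.0f ? 1.0f : X);
    }

    int main() {
      const float NaN = std::nanf("");

      // Nesting the patterns match: fmin(1.0, fmax(0.0, x)).
      // fmax(0.0, NaN) = 0.0, fmin(1.0, 0.0) = 0.0 -- agrees with sat(NaN).
      float Matched = std::fmin(1.0f, std::fmax(0.0f, NaN));

      // The other nesting: fmax(0.0, fmin(x, 1.0)).
      // fmin(NaN, 1.0) = 1.0, fmax(0.0, 1.0) = 1.0 -- disagrees with sat(NaN).
      float Other = std::fmax(0.0f, std::fmin(NaN, 1.0f));

      std::printf("sat(NaN)=%g  matched=%g  other=%g\n", sat(NaN), Matched, Other);
      return 0;
    }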
-def : Pat<(int_nvvm_fmin_f immFloat1, - (int_nvvm_fmax_f immFloat0, f32:$a)), +def : Pat<(int_nvvm_fmin_f fpimm_1, + (int_nvvm_fmax_f fpimm_0, f32:$a)), (CVT_f32_f32 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_f immFloat1, - (int_nvvm_fmax_f f32:$a, immFloat0)), +def : Pat<(int_nvvm_fmin_f fpimm_1, + (int_nvvm_fmax_f f32:$a, fpimm_0)), (CVT_f32_f32 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_f - (int_nvvm_fmax_f immFloat0, f32:$a), immFloat1), + (int_nvvm_fmax_f fpimm_0, f32:$a), fpimm_1), (CVT_f32_f32 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_f - (int_nvvm_fmax_f f32:$a, immFloat0), immFloat1), + (int_nvvm_fmax_f f32:$a, fpimm_0), fpimm_1), (CVT_f32_f32 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_d immDouble1, - (int_nvvm_fmax_d immDouble0, f64:$a)), +def : Pat<(int_nvvm_fmin_d fpimm_1, + (int_nvvm_fmax_d fpimm_0, f64:$a)), (CVT_f64_f64 $a, CvtSAT)>; -def : Pat<(int_nvvm_fmin_d immDouble1, - (int_nvvm_fmax_d f64:$a, immDouble0)), +def : Pat<(int_nvvm_fmin_d fpimm_1, + (int_nvvm_fmax_d f64:$a, fpimm_0)), (CVT_f64_f64 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_d - (int_nvvm_fmax_d immDouble0, f64:$a), immDouble1), + (int_nvvm_fmax_d fpimm_0, f64:$a), fpimm_1), (CVT_f64_f64 $a, CvtSAT)>; def : Pat<(int_nvvm_fmin_d - (int_nvvm_fmax_d f64:$a, immDouble0), immDouble1), + (int_nvvm_fmax_d f64:$a, fpimm_0), fpimm_1), (CVT_f64_f64 $a, CvtSAT)>; @@ -1329,12 +1309,12 @@ defm INT_NVVM_FMAN : MIN_MAX<"max">; // Multiplication // -def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16", B16, B16, B16, int_nvvm_mulhi_s>; -def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16", B16, B16, B16, int_nvvm_mulhi_us>; -def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32", B32, B32, B32, int_nvvm_mulhi_i>; -def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32", B32, B32, B32, int_nvvm_mulhi_ui>; -def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64", B64, B64, B64, int_nvvm_mulhi_ll>; -def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64", B64, B64, B64, int_nvvm_mulhi_ull>; +def : Pat<(int_nvvm_mulhi_s i16:$a, i16:$b), (MUL_HI_S16rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_us i16:$a, i16:$b), (MUL_HI_U16rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_i i32:$a, i32:$b), (MUL_HI_S32rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ui i32:$a, i32:$b), (MUL_HI_U32rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ll i64:$a, i64:$b), (MUL_HI_S64rr $a, $b)>; +def : Pat<(int_nvvm_mulhi_ull i64:$a, i64:$b), (MUL_HI_U64rr $a, $b)>; def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32", B32, B32, B32, int_nvvm_mul_rn_ftz_f>; def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32", B32, B32, B32, int_nvvm_mul_rn_f>; @@ -1357,8 +1337,8 @@ def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32", B32, B32, B32, int_nvvm_mul24_u // Div // -def INT_NVVM_DIV_APPROX_FTZ_F : F_MATH_2<"div.approx.ftz.f32", B32, B32, B32, int_nvvm_div_approx_ftz_f>; -def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32", B32, B32, B32, int_nvvm_div_approx_f>; +def : Pat<(int_nvvm_div_approx_ftz_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, FTZ)>; +def : Pat<(int_nvvm_div_approx_f f32:$a, f32:$b), (DIV_APPROX_F32_rr $a, $b, NoFTZ)>; def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32", B32, B32, B32, int_nvvm_div_rn_ftz_f>; def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32", B32, B32, B32, int_nvvm_div_rn_f>; @@ -1663,13 +1643,13 @@ def : Pat<(int_nvvm_rsqrt_approx_d f64:$a), (RSQRT_APPROX_f64 $a, NoFTZ)>; // 1.0f / sqrt_approx -> rsqrt_approx let Predicates = [doRsqrtOpt] in { - def : Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)), + def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_f f32:$a)), (RSQRT_APPROX_f32 $a, NoFTZ)>; - def : Pat<(fdiv f32imm_1, 
(int_nvvm_sqrt_approx_ftz_f f32:$a)), + def : Pat<(fdiv fpimm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)), (RSQRT_APPROX_f32 $a, FTZ)>; // same for int_nvvm_sqrt_f when non-precision sqrt is requested - def : Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)), + def : Pat<(fdiv fpimm_1, (fsqrt_approx f32:$a)), (RSQRT_APPROX_f32 $a)>; } // @@ -2231,7 +2211,7 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; class LDU_G<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$result), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.b$fromWidth \t$result, [$src];", []>; + "ldu.global.b$fromWidth \t$result, [$src];">; def LDU_GLOBAL_i16 : LDU_G<B16>; def LDU_GLOBAL_i32 : LDU_G<B32>; @@ -2243,13 +2223,13 @@ def LDU_GLOBAL_i64 : LDU_G<B64>; class VLDU_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; + "ldu.global.v2.b$fromWidth \t{{$dst1, $dst2}}, [$src];">; class VLDU_G_ELE_V4<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins i32imm:$fromWidth, ADDR:$src), - "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; + "ldu.global.v4.b$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">; def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<B16>; @@ -2270,9 +2250,8 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<B32>; class LDG_G<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; + "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];">; -def LD_GLOBAL_NC_i8 : LDG_G<B16>; def LD_GLOBAL_NC_i16 : LDG_G<B16>; def LD_GLOBAL_NC_i32 : LDG_G<B32>; def LD_GLOBAL_NC_i64 : LDG_G<B64>; @@ -2283,19 +2262,19 @@ def LD_GLOBAL_NC_i64 : LDG_G<B64>; class VLDG_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; + "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];">; class VLDG_G_ELE_V4<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; + "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];">; class VLDG_G_ELE_V8<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), - "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; + "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];">; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
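The SULD/SUST hunks that follow collapse the long hand-written clamp/trap/zero defm lists into foreach loops, deriving the record-name suffix via !toupper(op) and the PTX mnemonic suffix from the lowercase op. A small C++ sketch of the same name-generation scheme (purely illustrative; it prints names, whereas TableGen defines records):

    #include <cctype>
    #include <cstdio>
    #include <string>

    // Mirrors the TableGen pattern used in the hunks below: the lowercase op
    // builds the PTX suffix and the uppercased op builds the record-name suffix.
    static std::string toUpper(std::string S) {
      for (char &C : S)
        C = static_cast<char>(std::toupper(static_cast<unsigned char>(C)));
      return S;
    }

    int main() {
      const char *Ops[] = {"clamp", "trap", "zero"};
      const char *Widths[] = {"8", "16", "32", "64"};
      for (const char *Op : Ops)
        for (const char *W : Widths)
          std::printf("defm SULD_1D_I%s_%s : SULD_1D<\"suld.b.1d.b%s.%s\", ...>;\n",
                      W, toUpper(Op).c_str(), W, Op);
      return 0;
    }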
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<B16>; @@ -3540,20 +3519,13 @@ multiclass SULD_1D<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_I8_CLAMP : SULD_1D<"suld.b.1d.b8.clamp", B16>; -defm SULD_1D_I16_CLAMP : SULD_1D<"suld.b.1d.b16.clamp", B16>; -defm SULD_1D_I32_CLAMP : SULD_1D<"suld.b.1d.b32.clamp", B32>; -defm SULD_1D_I64_CLAMP : SULD_1D<"suld.b.1d.b64.clamp", B64>; - -defm SULD_1D_I8_TRAP : SULD_1D<"suld.b.1d.b8.trap", B16>; -defm SULD_1D_I16_TRAP : SULD_1D<"suld.b.1d.b16.trap", B16>; -defm SULD_1D_I32_TRAP : SULD_1D<"suld.b.1d.b32.trap", B32>; -defm SULD_1D_I64_TRAP : SULD_1D<"suld.b.1d.b64.trap", B64>; - -defm SULD_1D_I8_ZERO : SULD_1D<"suld.b.1d.b8.zero", B16>; -defm SULD_1D_I16_ZERO : SULD_1D<"suld.b.1d.b16.zero", B16>; -defm SULD_1D_I32_ZERO : SULD_1D<"suld.b.1d.b32.zero", B32>; -defm SULD_1D_I64_ZERO : SULD_1D<"suld.b.1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_I8_ # op_upper : SULD_1D<"suld.b.1d.b8." # op, B16>; + defm SULD_1D_I16_ # op_upper : SULD_1D<"suld.b.1d.b16." # op, B16>; + defm SULD_1D_I32_ # op_upper : SULD_1D<"suld.b.1d.b32." # op, B32>; + defm SULD_1D_I64_ # op_upper : SULD_1D<"suld.b.1d.b64." # op, B64>; +} class SULD_1D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3570,20 +3542,13 @@ multiclass SULD_1D_ARRAY<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_I8_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b8.clamp", B16>; -defm SULD_1D_ARRAY_I16_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b16.clamp", B16>; -defm SULD_1D_ARRAY_I32_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b32.clamp", B32>; -defm SULD_1D_ARRAY_I64_CLAMP : SULD_1D_ARRAY<"suld.b.a1d.b64.clamp", B64>; - -defm SULD_1D_ARRAY_I8_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b8.trap", B16>; -defm SULD_1D_ARRAY_I16_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b16.trap", B16>; -defm SULD_1D_ARRAY_I32_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b32.trap", B32>; -defm SULD_1D_ARRAY_I64_TRAP : SULD_1D_ARRAY<"suld.b.a1d.b64.trap", B64>; - -defm SULD_1D_ARRAY_I8_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b8.zero", B16>; -defm SULD_1D_ARRAY_I16_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b16.zero", B16>; -defm SULD_1D_ARRAY_I32_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b32.zero", B32>; -defm SULD_1D_ARRAY_I64_ZERO : SULD_1D_ARRAY<"suld.b.a1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_I8_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b8." # op, B16>; + defm SULD_1D_ARRAY_I16_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b16." # op, B16>; + defm SULD_1D_ARRAY_I32_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b32." # op, B32>; + defm SULD_1D_ARRAY_I64_ # op_upper : SULD_1D_ARRAY<"suld.b.a1d.b64." 
# op, B64>; +} class SULD_2D_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3599,20 +3564,13 @@ multiclass SULD_2D<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_I8_CLAMP : SULD_2D<"suld.b.2d.b8.clamp", B16>; -defm SULD_2D_I16_CLAMP : SULD_2D<"suld.b.2d.b16.clamp", B16>; -defm SULD_2D_I32_CLAMP : SULD_2D<"suld.b.2d.b32.clamp", B32>; -defm SULD_2D_I64_CLAMP : SULD_2D<"suld.b.2d.b64.clamp", B64>; - -defm SULD_2D_I8_TRAP : SULD_2D<"suld.b.2d.b8.trap", B16>; -defm SULD_2D_I16_TRAP : SULD_2D<"suld.b.2d.b16.trap", B16>; -defm SULD_2D_I32_TRAP : SULD_2D<"suld.b.2d.b32.trap", B32>; -defm SULD_2D_I64_TRAP : SULD_2D<"suld.b.2d.b64.trap", B64>; - -defm SULD_2D_I8_ZERO : SULD_2D<"suld.b.2d.b8.zero", B16>; -defm SULD_2D_I16_ZERO : SULD_2D<"suld.b.2d.b16.zero", B16>; -defm SULD_2D_I32_ZERO : SULD_2D<"suld.b.2d.b32.zero", B32>; -defm SULD_2D_I64_ZERO : SULD_2D<"suld.b.2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_I8_ # op_upper : SULD_2D<"suld.b.2d.b8." # op, B16>; + defm SULD_2D_I16_ # op_upper : SULD_2D<"suld.b.2d.b16." # op, B16>; + defm SULD_2D_I32_ # op_upper : SULD_2D<"suld.b.2d.b32." # op, B32>; + defm SULD_2D_I64_ # op_upper : SULD_2D<"suld.b.2d.b64." # op, B64>; +} class SULD_2D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3629,20 +3587,13 @@ multiclass SULD_2D_ARRAY<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_I8_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b8.clamp", B16>; -defm SULD_2D_ARRAY_I16_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b16.clamp", B16>; -defm SULD_2D_ARRAY_I32_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b32.clamp", B32>; -defm SULD_2D_ARRAY_I64_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b64.clamp", B64>; - -defm SULD_2D_ARRAY_I8_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b8.trap", B16>; -defm SULD_2D_ARRAY_I16_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b16.trap", B16>; -defm SULD_2D_ARRAY_I32_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b32.trap", B32>; -defm SULD_2D_ARRAY_I64_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b64.trap", B64>; - -defm SULD_2D_ARRAY_I8_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b8.zero", B16>; -defm SULD_2D_ARRAY_I16_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b16.zero", B16>; -defm SULD_2D_ARRAY_I32_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b32.zero", B32>; -defm SULD_2D_ARRAY_I64_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_I8_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b8." # op, B16>; + defm SULD_2D_ARRAY_I16_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b16." # op, B16>; + defm SULD_2D_ARRAY_I32_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b32." # op, B32>; + defm SULD_2D_ARRAY_I64_ # op_upper : SULD_2D_ARRAY<"suld.b.a2d.b64." 
# op, B64>; +} class SULD_3D_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3659,20 +3610,13 @@ multiclass SULD_3D<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_I8_CLAMP : SULD_3D<"suld.b.3d.b8.clamp", B16>; -defm SULD_3D_I16_CLAMP : SULD_3D<"suld.b.3d.b16.clamp", B16>; -defm SULD_3D_I32_CLAMP : SULD_3D<"suld.b.3d.b32.clamp", B32>; -defm SULD_3D_I64_CLAMP : SULD_3D<"suld.b.3d.b64.clamp", B64>; - -defm SULD_3D_I8_TRAP : SULD_3D<"suld.b.3d.b8.trap", B16>; -defm SULD_3D_I16_TRAP : SULD_3D<"suld.b.3d.b16.trap", B16>; -defm SULD_3D_I32_TRAP : SULD_3D<"suld.b.3d.b32.trap", B32>; -defm SULD_3D_I64_TRAP : SULD_3D<"suld.b.3d.b64.trap", B64>; - -defm SULD_3D_I8_ZERO : SULD_3D<"suld.b.3d.b8.zero", B16>; -defm SULD_3D_I16_ZERO : SULD_3D<"suld.b.3d.b16.zero", B16>; -defm SULD_3D_I32_ZERO : SULD_3D<"suld.b.3d.b32.zero", B32>; -defm SULD_3D_I64_ZERO : SULD_3D<"suld.b.3d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_I8_ # op_upper : SULD_3D<"suld.b.3d.b8." # op, B16>; + defm SULD_3D_I16_ # op_upper : SULD_3D<"suld.b.3d.b16." # op, B16>; + defm SULD_3D_I32_ # op_upper : SULD_3D<"suld.b.3d.b32." # op, B32>; + defm SULD_3D_I64_ # op_upper : SULD_3D<"suld.b.3d.b64." # op, B64>; +} } let IsSuld = 2 in { @@ -3692,20 +3636,13 @@ multiclass SULD_1D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_V2I8_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b8.clamp", B16>; -defm SULD_1D_V2I16_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b16.clamp", B16>; -defm SULD_1D_V2I32_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b32.clamp", B32>; -defm SULD_1D_V2I64_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b64.clamp", B64>; - -defm SULD_1D_V2I8_TRAP : SULD_1D_V2<"suld.b.1d.v2.b8.trap", B16>; -defm SULD_1D_V2I16_TRAP : SULD_1D_V2<"suld.b.1d.v2.b16.trap", B16>; -defm SULD_1D_V2I32_TRAP : SULD_1D_V2<"suld.b.1d.v2.b32.trap", B32>; -defm SULD_1D_V2I64_TRAP : SULD_1D_V2<"suld.b.1d.v2.b64.trap", B64>; - -defm SULD_1D_V2I8_ZERO : SULD_1D_V2<"suld.b.1d.v2.b8.zero", B16>; -defm SULD_1D_V2I16_ZERO : SULD_1D_V2<"suld.b.1d.v2.b16.zero", B16>; -defm SULD_1D_V2I32_ZERO : SULD_1D_V2<"suld.b.1d.v2.b32.zero", B32>; -defm SULD_1D_V2I64_ZERO : SULD_1D_V2<"suld.b.1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_V2I8_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b8." # op, B16>; + defm SULD_1D_V2I16_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b16." # op, B16>; + defm SULD_1D_V2I32_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b32." # op, B32>; + defm SULD_1D_V2I64_ # op_upper : SULD_1D_V2<"suld.b.1d.v2.b64." 
# op, B64>; +} class SULD_1D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3722,20 +3659,13 @@ multiclass SULD_1D_ARRAY_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_V2I8_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.clamp", B16>; -defm SULD_1D_ARRAY_V2I16_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.clamp", B16>; -defm SULD_1D_ARRAY_V2I32_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.clamp", B32>; -defm SULD_1D_ARRAY_V2I64_CLAMP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.clamp", B64>; - -defm SULD_1D_ARRAY_V2I8_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.trap", B16>; -defm SULD_1D_ARRAY_V2I16_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.trap", B16>; -defm SULD_1D_ARRAY_V2I32_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.trap", B32>; -defm SULD_1D_ARRAY_V2I64_TRAP : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.trap", B64>; - -defm SULD_1D_ARRAY_V2I8_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.zero", B16>; -defm SULD_1D_ARRAY_V2I16_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.zero", B16>; -defm SULD_1D_ARRAY_V2I32_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.zero", B32>; -defm SULD_1D_ARRAY_V2I64_ZERO : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_V2I8_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8." # op, B16>; + defm SULD_1D_ARRAY_V2I16_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16." # op, B16>; + defm SULD_1D_ARRAY_V2I32_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32." # op, B32>; + defm SULD_1D_ARRAY_V2I64_ # op_upper : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64." # op, B64>; +} class SULD_2D_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3752,20 +3682,13 @@ multiclass SULD_2D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_V2I8_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b8.clamp", B16>; -defm SULD_2D_V2I16_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b16.clamp", B16>; -defm SULD_2D_V2I32_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b32.clamp", B32>; -defm SULD_2D_V2I64_CLAMP : SULD_2D_V2<"suld.b.2d.v2.b64.clamp", B64>; - -defm SULD_2D_V2I8_TRAP : SULD_2D_V2<"suld.b.2d.v2.b8.trap", B16>; -defm SULD_2D_V2I16_TRAP : SULD_2D_V2<"suld.b.2d.v2.b16.trap", B16>; -defm SULD_2D_V2I32_TRAP : SULD_2D_V2<"suld.b.2d.v2.b32.trap", B32>; -defm SULD_2D_V2I64_TRAP : SULD_2D_V2<"suld.b.2d.v2.b64.trap", B64>; - -defm SULD_2D_V2I8_ZERO : SULD_2D_V2<"suld.b.2d.v2.b8.zero", B16>; -defm SULD_2D_V2I16_ZERO : SULD_2D_V2<"suld.b.2d.v2.b16.zero", B16>; -defm SULD_2D_V2I32_ZERO : SULD_2D_V2<"suld.b.2d.v2.b32.zero", B32>; -defm SULD_2D_V2I64_ZERO : SULD_2D_V2<"suld.b.2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_V2I8_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b8." # op, B16>; + defm SULD_2D_V2I16_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b16." # op, B16>; + defm SULD_2D_V2I32_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b32." # op, B32>; + defm SULD_2D_V2I64_ # op_upper : SULD_2D_V2<"suld.b.2d.v2.b64." 
# op, B64>; +} class SULD_2D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3782,20 +3705,13 @@ multiclass SULD_2D_ARRAY_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_V2I8_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.clamp", B16>; -defm SULD_2D_ARRAY_V2I16_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.clamp", B16>; -defm SULD_2D_ARRAY_V2I32_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.clamp", B32>; -defm SULD_2D_ARRAY_V2I64_CLAMP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.clamp", B64>; - -defm SULD_2D_ARRAY_V2I8_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.trap", B16>; -defm SULD_2D_ARRAY_V2I16_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.trap", B16>; -defm SULD_2D_ARRAY_V2I32_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.trap", B32>; -defm SULD_2D_ARRAY_V2I64_TRAP : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.trap", B64>; - -defm SULD_2D_ARRAY_V2I8_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.zero", B16>; -defm SULD_2D_ARRAY_V2I16_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.zero", B16>; -defm SULD_2D_ARRAY_V2I32_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.zero", B32>; -defm SULD_2D_ARRAY_V2I64_ZERO : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_V2I8_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8." # op, B16>; + defm SULD_2D_ARRAY_V2I16_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16." # op, B16>; + defm SULD_2D_ARRAY_V2I32_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32." # op, B32>; + defm SULD_2D_ARRAY_V2I64_ # op_upper : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64." # op, B64>; +} class SULD_3D_V2_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3812,20 +3728,13 @@ multiclass SULD_3D_V2<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_V2_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_V2I8_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b8.clamp", B16>; -defm SULD_3D_V2I16_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b16.clamp", B16>; -defm SULD_3D_V2I32_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b32.clamp", B32>; -defm SULD_3D_V2I64_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b64.clamp", B64>; - -defm SULD_3D_V2I8_TRAP : SULD_3D_V2<"suld.b.3d.v2.b8.trap", B16>; -defm SULD_3D_V2I16_TRAP : SULD_3D_V2<"suld.b.3d.v2.b16.trap", B16>; -defm SULD_3D_V2I32_TRAP : SULD_3D_V2<"suld.b.3d.v2.b32.trap", B32>; -defm SULD_3D_V2I64_TRAP : SULD_3D_V2<"suld.b.3d.v2.b64.trap", B64>; - -defm SULD_3D_V2I8_ZERO : SULD_3D_V2<"suld.b.3d.v2.b8.zero", B16>; -defm SULD_3D_V2I16_ZERO : SULD_3D_V2<"suld.b.3d.v2.b16.zero", B16>; -defm SULD_3D_V2I32_ZERO : SULD_3D_V2<"suld.b.3d.v2.b32.zero", B32>; -defm SULD_3D_V2I64_ZERO : SULD_3D_V2<"suld.b.3d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_V2I8_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b8." # op, B16>; + defm SULD_3D_V2I16_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b16." # op, B16>; + defm SULD_3D_V2I32_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b32." # op, B32>; + defm SULD_3D_V2I64_ # op_upper : SULD_3D_V2<"suld.b.3d.v2.b64." 
# op, B64>; +} } @@ -3846,17 +3755,12 @@ multiclass SULD_1D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_V4I8_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b8.clamp", B16>; -defm SULD_1D_V4I16_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b16.clamp", B16>; -defm SULD_1D_V4I32_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b32.clamp", B32>; - -defm SULD_1D_V4I8_TRAP : SULD_1D_V4<"suld.b.1d.v4.b8.trap", B16>; -defm SULD_1D_V4I16_TRAP : SULD_1D_V4<"suld.b.1d.v4.b16.trap", B16>; -defm SULD_1D_V4I32_TRAP : SULD_1D_V4<"suld.b.1d.v4.b32.trap", B32>; - -defm SULD_1D_V4I8_ZERO : SULD_1D_V4<"suld.b.1d.v4.b8.zero", B16>; -defm SULD_1D_V4I16_ZERO : SULD_1D_V4<"suld.b.1d.v4.b16.zero", B16>; -defm SULD_1D_V4I32_ZERO : SULD_1D_V4<"suld.b.1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_V4I8_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b8." # op, B16>; + defm SULD_1D_V4I16_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b16." # op, B16>; + defm SULD_1D_V4I32_ # op_upper : SULD_1D_V4<"suld.b.1d.v4.b32." # op, B32>; +} class SULD_1D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3874,17 +3778,12 @@ multiclass SULD_1D_ARRAY_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_1D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_1D_ARRAY_V4I8_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.clamp", B16>; -defm SULD_1D_ARRAY_V4I16_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.clamp", B16>; -defm SULD_1D_ARRAY_V4I32_CLAMP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.clamp", B32>; - -defm SULD_1D_ARRAY_V4I8_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.trap", B16>; -defm SULD_1D_ARRAY_V4I16_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.trap", B16>; -defm SULD_1D_ARRAY_V4I32_TRAP : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.trap", B32>; - -defm SULD_1D_ARRAY_V4I8_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.zero", B16>; -defm SULD_1D_ARRAY_V4I16_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.zero", B16>; -defm SULD_1D_ARRAY_V4I32_ZERO : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_1D_ARRAY_V4I8_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8." # op, B16>; + defm SULD_1D_ARRAY_V4I16_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16." # op, B16>; + defm SULD_1D_ARRAY_V4I32_ # op_upper : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32." # op, B32>; +} class SULD_2D_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3901,17 +3800,12 @@ multiclass SULD_2D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_V4I8_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b8.clamp", B16>; -defm SULD_2D_V4I16_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b16.clamp", B16>; -defm SULD_2D_V4I32_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b32.clamp", B32>; - -defm SULD_2D_V4I8_TRAP : SULD_2D_V4<"suld.b.2d.v4.b8.trap", B16>; -defm SULD_2D_V4I16_TRAP : SULD_2D_V4<"suld.b.2d.v4.b16.trap", B16>; -defm SULD_2D_V4I32_TRAP : SULD_2D_V4<"suld.b.2d.v4.b32.trap", B32>; - -defm SULD_2D_V4I8_ZERO : SULD_2D_V4<"suld.b.2d.v4.b8.zero", B16>; -defm SULD_2D_V4I16_ZERO : SULD_2D_V4<"suld.b.2d.v4.b16.zero", B16>; -defm SULD_2D_V4I32_ZERO : SULD_2D_V4<"suld.b.2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_V4I8_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b8." # op, B16>; + defm SULD_2D_V4I16_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b16." 
# op, B16>; + defm SULD_2D_V4I32_ # op_upper : SULD_2D_V4<"suld.b.2d.v4.b32." # op, B32>; +} class SULD_2D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3929,17 +3823,12 @@ multiclass SULD_2D_ARRAY_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_2D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_2D_ARRAY_V4I8_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.clamp", B16>; -defm SULD_2D_ARRAY_V4I16_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.clamp", B16>; -defm SULD_2D_ARRAY_V4I32_CLAMP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.clamp", B32>; - -defm SULD_2D_ARRAY_V4I8_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.trap", B16>; -defm SULD_2D_ARRAY_V4I16_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.trap", B16>; -defm SULD_2D_ARRAY_V4I32_TRAP : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.trap", B32>; - -defm SULD_2D_ARRAY_V4I8_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.zero", B16>; -defm SULD_2D_ARRAY_V4I16_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.zero", B16>; -defm SULD_2D_ARRAY_V4I32_ZERO : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_2D_ARRAY_V4I8_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8." # op, B16>; + defm SULD_2D_ARRAY_V4I16_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16." # op, B16>; + defm SULD_2D_ARRAY_V4I32_ # op_upper : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32." # op, B32>; +} class SULD_3D_V4_base<string inst, NVPTXRegClass outtype, dag surf, list<dag> pattern = []> @@ -3956,17 +3845,12 @@ multiclass SULD_3D_V4<string inst, NVPTXRegClass outtype> { def _I : SULD_3D_V4_base<inst, outtype, (ins i64imm:$s)>; } -defm SULD_3D_V4I8_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b8.clamp", B16>; -defm SULD_3D_V4I16_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b16.clamp", B16>; -defm SULD_3D_V4I32_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b32.clamp", B32>; - -defm SULD_3D_V4I8_TRAP : SULD_3D_V4<"suld.b.3d.v4.b8.trap", B16>; -defm SULD_3D_V4I16_TRAP : SULD_3D_V4<"suld.b.3d.v4.b16.trap", B16>; -defm SULD_3D_V4I32_TRAP : SULD_3D_V4<"suld.b.3d.v4.b32.trap", B32>; - -defm SULD_3D_V4I8_ZERO : SULD_3D_V4<"suld.b.3d.v4.b8.zero", B16>; -defm SULD_3D_V4I16_ZERO : SULD_3D_V4<"suld.b.3d.v4.b16.zero", B16>; -defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SULD_3D_V4I8_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b8." # op, B16>; + defm SULD_3D_V4I16_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b16." # op, B16>; + defm SULD_3D_V4I32_ # op_upper : SULD_3D_V4<"suld.b.3d.v4.b32." 
# op, B32>; +} } @@ -4037,20 +3921,13 @@ multiclass SUST_1D<string inst, NVPTXRegClass intype> { def _I : SUST_1D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", B16>; -defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", B16>; -defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", B32>; -defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", B64>; - -defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", B16>; -defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", B16>; -defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", B32>; -defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", B64>; - -defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", B16>; -defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", B16>; -defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", B32>; -defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_I8_ # op_upper : SUST_1D<"sust.b.1d.b8." # op, B16>; + defm SUST_B_1D_I16_ # op_upper : SUST_1D<"sust.b.1d.b16." # op, B16>; + defm SUST_B_1D_I32_ # op_upper : SUST_1D<"sust.b.1d.b32." # op, B32>; + defm SUST_B_1D_I64_ # op_upper : SUST_1D<"sust.b.1d.b64." # op, B64>; +} defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", B16>; defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", B16>; @@ -4068,23 +3945,13 @@ multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s), []>; } -// int_nvvm_sust_b_1d_v2i8_clamp - -defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", B16>; -defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", B16>; -defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", B32>; -defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", B64>; - -defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", B16>; -defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", B16>; -defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", B32>; -defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", B64>; - -defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", B16>; -defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", B16>; -defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", B32>; -defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_V2I8_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b8." # op, B16>; + defm SUST_B_1D_V2I16_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b16." # op, B16>; + defm SUST_B_1D_V2I32_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b32." # op, B32>; + defm SUST_B_1D_V2I64_ # op_upper : SUST_1D_V2<"sust.b.1d.v2.b64." 
# op, B64>; +} defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", B16>; defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", B16>; defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", B32>; @@ -4103,17 +3970,12 @@ multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", B16>; -defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", B16>; -defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", B32>; - -defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", B16>; -defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", B16>; -defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", B32>; - -defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", B16>; -defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", B16>; -defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_V4I8_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b8." # op, B16>; + defm SUST_B_1D_V4I16_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b16." # op, B16>; + defm SUST_B_1D_V4I32_ # op_upper : SUST_1D_V4<"sust.b.1d.v4.b32." # op, B32>; +} defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", B16>; defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", B16>; @@ -4131,20 +3993,13 @@ multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_I8_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_I16_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_I32_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", B32>; -defm SUST_B_1D_ARRAY_I64_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", B64>; - -defm SUST_B_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", B16>; -defm SUST_B_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", B16>; -defm SUST_B_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", B32>; -defm SUST_B_1D_ARRAY_I64_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", B64>; - -defm SUST_B_1D_ARRAY_I8_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", B16>; -defm SUST_B_1D_ARRAY_I16_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", B16>; -defm SUST_B_1D_ARRAY_I32_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", B32>; -defm SUST_B_1D_ARRAY_I64_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_I8_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b8." # op, B16>; + defm SUST_B_1D_ARRAY_I16_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b16." # op, B16>; + defm SUST_B_1D_ARRAY_I32_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b32." # op, B32>; + defm SUST_B_1D_ARRAY_I64_ # op_upper : SUST_1D_ARRAY<"sust.b.a1d.b64." 
# op, B64>; +} defm SUST_P_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", B16>; defm SUST_P_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", B16>; @@ -4164,20 +4019,13 @@ multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_V2I8_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_V2I16_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_V2I32_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", B32>; -defm SUST_B_1D_ARRAY_V2I64_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", B64>; - -defm SUST_B_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", B16>; -defm SUST_B_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", B16>; -defm SUST_B_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", B32>; -defm SUST_B_1D_ARRAY_V2I64_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", B64>; - -defm SUST_B_1D_ARRAY_V2I8_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", B16>; -defm SUST_B_1D_ARRAY_V2I16_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", B16>; -defm SUST_B_1D_ARRAY_V2I32_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", B32>; -defm SUST_B_1D_ARRAY_V2I64_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_V2I8_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8." # op, B16>; + defm SUST_B_1D_ARRAY_V2I16_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16." # op, B16>; + defm SUST_B_1D_ARRAY_V2I32_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32." # op, B32>; + defm SUST_B_1D_ARRAY_V2I64_ # op_upper : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64." # op, B64>; +} defm SUST_P_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", B16>; defm SUST_P_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", B16>; @@ -4197,33 +4045,16 @@ multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> { def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_1D_ARRAY_V4I8_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", B16>; -defm SUST_B_1D_ARRAY_V4I16_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", B16>; -defm SUST_B_1D_ARRAY_V4I32_CLAMP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", B32>; - -defm SUST_B_1D_ARRAY_V4I8_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", B16>; -defm SUST_B_1D_ARRAY_V4I16_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", B16>; -defm SUST_B_1D_ARRAY_V4I32_TRAP - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", B32>; - -defm SUST_B_1D_ARRAY_V4I8_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", B16>; -defm SUST_B_1D_ARRAY_V4I16_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", B16>; -defm SUST_B_1D_ARRAY_V4I32_ZERO - : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", B32>; - -defm SUST_P_1D_ARRAY_V4I8_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>; -defm SUST_P_1D_ARRAY_V4I16_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>; -defm SUST_P_1D_ARRAY_V4I32_TRAP - : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_1D_ARRAY_V4I8_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8." # op, B16>; + defm SUST_B_1D_ARRAY_V4I16_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16." # op, B16>; + defm SUST_B_1D_ARRAY_V4I32_ # op_upper : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32." 
# op, B32>; +} + +defm SUST_P_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", B16>; +defm SUST_P_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", B16>; +defm SUST_P_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", B32>; class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat> : NVPTXInst<(outs), @@ -4237,20 +4068,13 @@ multiclass SUST_2D<string inst, NVPTXRegClass intype> { def _I : SUST_2D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", B16>; -defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", B16>; -defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", B32>; -defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", B64>; - -defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", B16>; -defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", B16>; -defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", B32>; -defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", B64>; - -defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", B16>; -defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", B16>; -defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", B32>; -defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_I8_ # op_upper : SUST_2D<"sust.b.2d.b8." # op, B16>; + defm SUST_B_2D_I16_ # op_upper : SUST_2D<"sust.b.2d.b16." # op, B16>; + defm SUST_B_2D_I32_ # op_upper : SUST_2D<"sust.b.2d.b32." # op, B32>; + defm SUST_B_2D_I64_ # op_upper : SUST_2D<"sust.b.2d.b64." # op, B64>; +} defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", B16>; defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", B16>; @@ -4270,20 +4094,13 @@ multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", B16>; -defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", B16>; -defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", B32>; -defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", B64>; - -defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", B16>; -defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", B16>; -defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", B32>; -defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", B64>; - -defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", B16>; -defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", B16>; -defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", B32>; -defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_V2I8_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b8." # op, B16>; + defm SUST_B_2D_V2I16_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b16." # op, B16>; + defm SUST_B_2D_V2I32_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b32." # op, B32>; + defm SUST_B_2D_V2I64_ # op_upper : SUST_2D_V2<"sust.b.2d.v2.b64." 
# op, B64>; +} defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", B16>; defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", B16>; @@ -4303,17 +4120,12 @@ multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", B16>; -defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", B16>; -defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", B32>; - -defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", B16>; -defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", B16>; -defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", B32>; - -defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", B16>; -defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", B16>; -defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_V4I8_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b8." # op, B16>; + defm SUST_B_2D_V4I16_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b16." # op, B16>; + defm SUST_B_2D_V4I32_ # op_upper : SUST_2D_V4<"sust.b.2d.v4.b32." # op, B32>; +} defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", B16>; defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", B16>; @@ -4333,20 +4145,13 @@ multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_I8_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_I16_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_I32_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", B32>; -defm SUST_B_2D_ARRAY_I64_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", B64>; - -defm SUST_B_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", B16>; -defm SUST_B_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", B16>; -defm SUST_B_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", B32>; -defm SUST_B_2D_ARRAY_I64_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", B64>; - -defm SUST_B_2D_ARRAY_I8_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", B16>; -defm SUST_B_2D_ARRAY_I16_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", B16>; -defm SUST_B_2D_ARRAY_I32_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", B32>; -defm SUST_B_2D_ARRAY_I64_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_I8_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b8." # op, B16>; + defm SUST_B_2D_ARRAY_I16_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b16." # op, B16>; + defm SUST_B_2D_ARRAY_I32_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b32." # op, B32>; + defm SUST_B_2D_ARRAY_I64_ # op_upper : SUST_2D_ARRAY<"sust.b.a2d.b64." 
# op, B64>; +} defm SUST_P_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", B16>; defm SUST_P_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", B16>; @@ -4366,20 +4171,13 @@ multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_V2I8_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_V2I16_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_V2I32_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", B32>; -defm SUST_B_2D_ARRAY_V2I64_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", B64>; - -defm SUST_B_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", B16>; -defm SUST_B_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", B16>; -defm SUST_B_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", B32>; -defm SUST_B_2D_ARRAY_V2I64_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", B64>; - -defm SUST_B_2D_ARRAY_V2I8_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", B16>; -defm SUST_B_2D_ARRAY_V2I16_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", B16>; -defm SUST_B_2D_ARRAY_V2I32_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", B32>; -defm SUST_B_2D_ARRAY_V2I64_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", B64>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_V2I8_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8." # op, B16>; + defm SUST_B_2D_ARRAY_V2I16_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16." # op, B16>; + defm SUST_B_2D_ARRAY_V2I32_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32." # op, B32>; + defm SUST_B_2D_ARRAY_V2I64_ # op_upper : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64." # op, B64>; +} defm SUST_P_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", B16>; defm SUST_P_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", B16>; @@ -4399,17 +4197,12 @@ multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> { def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_2D_ARRAY_V4I8_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", B16>; -defm SUST_B_2D_ARRAY_V4I16_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", B16>; -defm SUST_B_2D_ARRAY_V4I32_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", B32>; - -defm SUST_B_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", B16>; -defm SUST_B_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", B16>; -defm SUST_B_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", B32>; - -defm SUST_B_2D_ARRAY_V4I8_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", B16>; -defm SUST_B_2D_ARRAY_V4I16_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", B16>; -defm SUST_B_2D_ARRAY_V4I32_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_2D_ARRAY_V4I8_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8." # op, B16>; + defm SUST_B_2D_ARRAY_V4I16_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16." # op, B16>; + defm SUST_B_2D_ARRAY_V4I32_ # op_upper : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32." 
# op, B32>; +} defm SUST_P_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", B16>; defm SUST_P_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", B16>; @@ -4429,21 +4222,13 @@ multiclass SUST_3D<string inst, NVPTXRegClass intype> { def _I : SUST_3D_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", B16>; -defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", B16>; -defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", B32>; -defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", B64>; - -defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", B16>; -defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", B16>; -defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", B32>; -defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", B64>; - -defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", B16>; -defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", B16>; -defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", B32>; -defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_I8_ # op_upper : SUST_3D<"sust.b.3d.b8." # op, B16>; + defm SUST_B_3D_I16_ # op_upper : SUST_3D<"sust.b.3d.b16." # op, B16>; + defm SUST_B_3D_I32_ # op_upper : SUST_3D<"sust.b.3d.b32." # op, B32>; + defm SUST_B_3D_I64_ # op_upper : SUST_3D<"sust.b.3d.b64." # op, B64>; +} defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", B16>; defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", B16>; defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", B32>; @@ -4462,21 +4247,13 @@ multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> { def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", B16>; -defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", B16>; -defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", B32>; -defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", B64>; - -defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", B16>; -defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", B16>; -defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", B32>; -defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", B64>; - -defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", B16>; -defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", B16>; -defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", B32>; -defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", B64>; - +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_V2I8_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b8." # op, B16>; + defm SUST_B_3D_V2I16_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b16." # op, B16>; + defm SUST_B_3D_V2I32_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b32." # op, B32>; + defm SUST_B_3D_V2I64_ # op_upper : SUST_3D_V2<"sust.b.3d.v2.b64." 
# op, B64>; +} defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", B16>; defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", B16>; defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", B32>; @@ -4495,17 +4272,12 @@ multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> { def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s), []>; } -defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", B16>; -defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", B16>; -defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", B32>; - -defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", B16>; -defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", B16>; -defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", B32>; - -defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", B16>; -defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", B16>; -defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", B32>; +foreach op = ["clamp", "trap", "zero"] in { + defvar op_upper = !toupper(op); + defm SUST_B_3D_V4I8_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b8." # op, B16>; + defm SUST_B_3D_V4I16_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b16." # op, B16>; + defm SUST_B_3D_V4I32_ # op_upper : SUST_3D_V4<"sust.b.3d.v4.b32." # op, B32>; +} defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", B16>; defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", B16>; @@ -5122,27 +4894,23 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align // // WGMMA fence instructions // -let isConvergent = true in { -def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned", - [(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>; +let isConvergent = true, Predicates = [hasSM90a, hasPTX<80>] in { + def WGMMA_FENCE_SYNC_ALIGNED : NullaryInst<"wgmma.fence.sync.aligned", int_nvvm_wgmma_fence_sync_aligned>; -def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned", - [(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>; + def WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NullaryInst<"wgmma.commit_group.sync.aligned", int_nvvm_wgmma_commit_group_sync_aligned>; -def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned", - [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>; -} // isConvergent = true + def WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned", + [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>; +} let Predicates = [hasSM<90>, hasPTX<78>] in { def GRIDDEPCONTROL_LAUNCH_DEPENDENTS : - BasicNVPTXInst<(outs), (ins), "griddepcontrol.launch_dependents", - [(int_nvvm_griddepcontrol_launch_dependents)]>; + NullaryInst<"griddepcontrol.launch_dependents", int_nvvm_griddepcontrol_launch_dependents>; def GRIDDEPCONTROL_WAIT : - BasicNVPTXInst<(outs), (ins), "griddepcontrol.wait", - [(int_nvvm_griddepcontrol_wait)]>; + NullaryInst<"griddepcontrol.wait", int_nvvm_griddepcontrol_wait>; } -def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; +def EXIT : NullaryInst<"exit", int_nvvm_exit>; // Tcgen05 intrinsics let isConvergent = true, Predicates = [hasTcgen05Instructions] in { @@ -5170,9 +4938,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1 defm TCGEN05_DEALLOC_CG2: 
TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { - def "" : BasicNVPTXInst<(outs), (ins), - "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", - [(Intr)]>; + def "" : NullaryInst<"tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", Intr>; } defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index f4362fe..e2bbe57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -412,6 +412,22 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, } } +// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the +// alignment of the return value based on the alignment of the argument. +static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB, + Argument &Arg) { + CallInst *ArgInParam = + IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, + {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()}, + &Arg, {}, Arg.getName() + ".param"); + + if (MaybeAlign ParamAlign = Arg.getParamAlign()) + ArgInParam->addRetAttr( + Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign)); + + return ArgInParam; +} + namespace { struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> { using Base = PtrUseVisitor<ArgUseChecker>; @@ -515,10 +531,7 @@ void copyByValParam(Function &F, Argument &Arg) { Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType))); Arg.replaceAllUsesWith(AllocA); - Value *ArgInParam = - IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()}, - &Arg, {}, Arg.getName()); + CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg); // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX // addrspacecast preserves alignment. Since params are constant, this load @@ -549,9 +562,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses())); IRBuilder<> IRB(&*FirstInst); - Value *ArgInParamAS = IRB.CreateIntrinsic( - Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg}); + CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg); for (Use *U : UsesToUpdate) convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant); @@ -581,10 +592,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { // argument already in the param address space, we need to use the noop // intrinsic, this had the added benefit of preventing other optimizations // from folding away this pair of addrspacecasts. - auto *ParamSpaceArg = - IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, - {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, - Arg, {}, Arg->getName() + ".param"); + auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg); // Cast param address to generic address space. 
Value *GenericArg = IRB.CreateAddrSpaceCast( diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp index 46aa27e..c8e576f 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -93,7 +93,7 @@ static bool clobbersCTR(const MachineInstr &MI) { static bool verifyCTRBranch(MachineBasicBlock *MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator BI = I; - SmallSet<MachineBasicBlock *, 16> Visited; + SmallPtrSet<MachineBasicBlock *, 16> Visited; SmallVector<MachineBasicBlock *, 8> Preds; bool CheckPreds; diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.h b/llvm/lib/Target/PowerPC/PPCCallingConv.h index ab61472..9c47142 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.h +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.h @@ -21,28 +21,29 @@ namespace llvm { bool RetCC_PPC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool RetCC_PPC_Cold(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC64_ELF(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4_ByVal(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_PPC32_SVR4_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index e92e00f..0b68ba1 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -1374,7 +1374,10 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value *> &Args, unsigned LinkageSize = Subtarget->getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, Align(8)); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS); + SmallVector<Type *, 16> ArgTys; + for (Value *Arg : Args) + ArgTys.push_back(Arg->getType()); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, ArgTys, CC_PPC64_ELF_FIS); // Bail out if we can't handle any of the arguments. 
for (const CCValAssign &VA : ArgLocs) { @@ -1487,7 +1490,7 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCInfo.AnalyzeCallResult(RetVT, CLI.RetTy, RetCC_PPC64_ELF_FIS); CCValAssign &VA = RVLocs[0]; assert(RVLocs.size() == 1 && "No support for multi-reg return values!"); assert(VA.isRegLoc() && "Can only return in registers!"); @@ -1573,7 +1576,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { RetVT != MVT::f64) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCInfo.AnalyzeCallResult(RetVT, RetTy, RetCC_PPC64_ELF_FIS); if (RVLocs.size() > 1) return false; } diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index c0860fc..2ad3ed2 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2078,8 +2078,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // tail call might not be in the new RestoreBlock, so real branch instruction // won't be generated by emitEpilogue(), because shrink-wrap has chosen new // RestoreBlock. So we handle this case here. - if (MFI.getSavePoint() && MFI.hasTailCall()) { - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty() && MFI.hasTailCall()) { + assert(MFI.getRestorePoints().size() < 2 && + "MFI can't contain multiple restore points!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); for (MachineBasicBlock &MBB : MF) { if (MBB.isReturnBlock() && (&MBB) != RestoreBlock) createTailCallBranchInstr(MBB); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2698bd6..652edd4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1787,11 +1787,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::PADDI_DTPREL: return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; - case PPCISD::SC: return "PPCISD::SC"; - case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; - case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; - case PPCISD::RFEBB: return "PPCISD::RFEBB"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; @@ -4051,18 +4048,13 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = Trmp; Args.push_back(Entry); - + Args.emplace_back(Trmp, IntPtrTy); // TrampSize == (isPPC64 ? 48 : 40); - Entry.Node = - DAG.getConstant(isPPC64 ? 48 : 40, dl, Subtarget.getScalarIntVT()); - Args.push_back(Entry); - - Entry.Node = FPtr; Args.push_back(Entry); - Entry.Node = Nest; Args.push_back(Entry); + Args.emplace_back( + DAG.getConstant(isPPC64 ? 
48 : 40, dl, Subtarget.getScalarIntVT()), + IntPtrTy); + Args.emplace_back(FPtr, IntPtrTy); + Args.emplace_back(Nest, IntPtrTy); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); @@ -6091,10 +6083,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( if (!ArgFlags.isVarArg()) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, - CCInfo); + Outs[i].OrigTy, CCInfo); } else { Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo); + ArgFlags, Outs[i].OrigTy, CCInfo); } if (Result) { @@ -6905,7 +6897,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) { static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) { + Type *OrigTy, CCState &State) { const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( State.getMachineFunction().getSubtarget()); const bool IsPPC64 = Subtarget.isPPC64(); @@ -14822,9 +14814,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); - SmallSet<SDNode *, 16> LoadRoots; + SmallPtrSet<SDNode *, 16> LoadRoots; SmallVector<SDNode *, 8> Queue(1, Chain.getNode()); - SmallSet<SDNode *, 16> Visited; + SmallPtrSet<SDNode *, 16> Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all @@ -19553,12 +19545,10 @@ SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout())); bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, false); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (const SDValue &N : Op->op_values()) { EVT ArgVT = N.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = N; - Entry.Ty = ArgTy; + TargetLowering::ArgListEntry Entry(N, ArgTy); Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, SignExtend); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 9755f0e..5e0d6bf 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -430,20 +430,6 @@ namespace llvm { /// optimizations due to constant folding. VADD_SPLAT, - /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned - /// operand identifies the operating system entry point. - SC, - - /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer. - CLRBHRB, - - /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch - /// history rolling buffer entry. - MFBHRBE, - - /// CHAIN = RFEBB CHAIN, State - Return from event-based branch. - RFEBB, - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little /// endian. Maps to an xxswapd instruction that corrects an lxvd2x /// or stxvd2x instruction. 
The chain is necessary because the diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 24287a9..79fe12e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1630,9 +1630,11 @@ def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>; // Decimal Copy-Sign/Set-Sign let Defs = [CR6] in -def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>; +def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", + [(set v16i8:$VD, (int_ppc_bcdcopysign v16i8:$VA, v16i8:$VB))]>; -def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>; +def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", + [(set v16i8:$VD, (int_ppc_bcdsetsign v16i8:$VB, i32:$PS))]>; // Decimal Shift/Unsigned-Shift/Shift-and-Round def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 99ef89a..c2f91ce 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -365,16 +365,6 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; -def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - -def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone, - [SDNPHasChain, SDNPSideEffect]>; -def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>; -def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; @@ -1673,7 +1663,7 @@ let isBranch = 1, isTerminator = 1, Size = 0 in { // System call. let PPC970_Unit = 7 in { def SC : SCForm<17, 1, 0, (outs), (ins i32imm:$LEV), - "sc $LEV", IIC_BrB, [(PPCsc (i32 imm:$LEV))]>; + "sc $LEV", IIC_BrB, []>; } // We mark SCV as having no scheduling model since it is only meant to be used @@ -1685,21 +1675,14 @@ let Predicates = [IsISA3_0], hasNoSchedulingInfo = 1 in { } // Branch history rolling buffer. -def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, - [(PPCclrbhrb)]>, +def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, []>, PPC970_DGroup_Single; -// The $dmy argument used for MFBHRBE is not needed; however, including -// it avoids automatic generation of PPCFastISel::fastEmit_i(), which -// interferes with necessary special handling (see PPCFastISel.cpp). 
-def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), - (ins u10imm:$imm, u10imm:$dmy), - "mfbhrbe $RT, $imm", IIC_BrB, - [(set i32:$RT, - (PPCmfbhrbe imm:$imm, imm:$dmy))]>, + +def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), (ins u10imm:$imm), + "mfbhrbe $RT, $imm", IIC_BrB, []>, PPC970_DGroup_First; -def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", - IIC_BrB, [(PPCrfebb (i32 imm:$S))]>, +def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", IIC_BrB, []>, PPC970_DGroup_Single; def : InstAlias<"rfebb", (RFEBB 1)>; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 709d7e7..adf9436 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -264,9 +264,8 @@ namespace { bool prepareBasesForCommoningChains(Bucket &BucketChain); /// Rewrite load/store according to the common chains. - bool - rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket, - SmallSet<BasicBlock *, 16> &BBChanged); + bool rewriteLoadStoresForCommoningChains( + Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged); /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. @@ -309,7 +308,7 @@ namespace { /// Rewrite load/store instructions in \p BucketChain according to /// preparation. bool rewriteLoadStores(Loop *L, Bucket &BucketChain, - SmallSet<BasicBlock *, 16> &BBChanged, + SmallPtrSet<BasicBlock *, 16> &BBChanged, PrepForm Form); /// Rewrite for the base load/store of a chain. @@ -523,7 +522,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) { if (prepareBasesForCommoningChains(Bucket)) @@ -537,7 +536,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, } bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( - Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) { + Loop *L, Bucket &Bucket, SmallPtrSet<BasicBlock *, 16> &BBChanged) { bool MadeChange = false; assert(Bucket.Elements.size() == @@ -1006,7 +1005,7 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { } bool PPCLoopInstrFormPrep::rewriteLoadStores( - Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged, + Loop *L, Bucket &BucketChain, SmallPtrSet<BasicBlock *, 16> &BBChanged, PrepForm Form) { bool MadeChange = false; @@ -1089,7 +1088,7 @@ bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L, bool MadeChange = false; if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) // The base address of each bucket is transformed into a phi and the others // are rewritten based on new base. 
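The SmallSet-to-SmallPtrSet conversions in this file (and in CallPrinter, CaptureTracking, PPCCTRLoopsVerify, and PPCISelLowering earlier in the patch) all target sets whose element type is a pointer. For pointer element types, SmallSet has historically been a thin wrapper that inherits from SmallPtrSet, so naming SmallPtrSet directly states the intended container without changing behavior; the sketch below is illustrative only and is not part of the patch (markChanged and its parameters are hypothetical names).

// Minimal sketch, assuming a standalone snippet: prefer SmallPtrSet for
// pointer keys, as the hunks above and below do for BBChanged.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

static bool markChanged(BasicBlock *BB,
                        SmallPtrSet<BasicBlock *, 16> &BBChanged) {
  // insert() returns {iterator, WasInserted}; the bool is true when BB was
  // not already present, which is how a pass can track newly changed blocks.
  return BBChanged.insert(BB).second;
}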
@@ -1110,7 +1109,7 @@ bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet<BasicBlock *, 16> BBChanged; + SmallPtrSet<BasicBlock *, 16> BBChanged; for (auto &Bucket : Buckets) { if (Bucket.Elements.size() < DispFormPrepMinThreshold) continue; diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 0ffd35d..74bce43 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -248,6 +248,10 @@ static bool splitMBB(BlockSplitInfo &BSI) { } addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI); + // Set the call frame size on ThisMBB to the new basic blocks. + // See https://reviews.llvm.org/D156113. + NewMBB->setCallFrameSize(TII->getCallFrameSizeAt(ThisMBB->back())); + LLVM_DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump()); LLVM_DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump()); LLVM_DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump()); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 76dca47..f123040 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SpillsKnownBit = true; break; default: + // When spilling a CR bit, the super register may not be explicitly defined + // (i.e. it can be defined by a CR-logical that only defines the subreg) so + // we state that the CR field is undef. Also, in order to preserve the kill + // flag on the CR bit, we add it as an implicit use. + // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all // bits (specifically, it produces a -1 if the CR bit is set). Ultimately, // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit // register), and SETNBC will set this. if (Subtarget.isISA3_1()) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg) - .addReg(SrcReg, RegState::Undef); + .addReg(SrcReg, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } @@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT || SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg) - .addReg(getCRFromCRBit(SrcReg), RegState::Undef); + .addReg(getCRFromCRBit(SrcReg), RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } } // We need to move the CR field that contains the CR bit we are spilling. - // The super register may not be explicitly defined (i.e. it can be defined - // by a CR-logical that only defines the subreg) so we state that the CR - // field is undef. Also, in order to preserve the kill flag on the CR bit, - // we add it as an implicit use. BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFOCRF8 : PPC::MFOCRF), Reg) .addReg(getCRFromCRBit(SrcReg), RegState::Undef) .addReg(SrcReg, diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d71c42c..d37ae2f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -903,6 +903,7 @@ public: VK == RISCV::S_QC_ABS20; } + bool isSImm8Unsigned() const { return isSImm<8>() || isUImm<8>(); } bool isSImm10Unsigned() const { return isSImm<10>() || isUImm<10>(); } bool isUImm20LUI() const { @@ -1199,6 +1200,14 @@ public: addExpr(Inst, getImm(), isRV64Imm()); } + void addSImm8UnsignedOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int64_t Imm; + [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm); + assert(IsConstant); + Inst.addOperand(MCOperand::createImm(SignExtend64<8>(Imm))); + } + void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); int64_t Imm; @@ -1547,6 +1556,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, 0, (1 << 9) - 8, "immediate must be a multiple of 8 bytes in the range"); + case Match_InvalidSImm8Unsigned: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 7), + (1 << 8) - 1); case Match_InvalidSImm10: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 9), (1 << 9) - 1); diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index e0ac591..78be55b 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -672,6 +672,8 @@ static constexpr FeatureBitset XAndesGroup = { RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; +static constexpr FeatureBitset XSMTGroup = {RISCV::FeatureVendorXSMTVDot}; + static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, @@ -692,6 +694,7 @@ static constexpr DecoderListEntry DecoderList32[]{ {RISCV::FeatureVendorXMIPSCBOP}, "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, + {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"}, // Standard Extensions {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index f83c2b6..51ea3fc 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -736,7 +736,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_FCONSTANT: { // TODO: Use constant pool for complex constants. - // TODO: Optimize +0.0 to use fcvt.d.w for s64 on rv32. 
Register DstReg = MI.getOperand(0).getReg(); const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF(); APInt Imm = FPimm.bitcastToAPInt(); @@ -753,8 +752,22 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { if (!FMV.constrainAllUses(TII, TRI, RBI)) return false; } else { + // s64 on rv32 assert(Size == 64 && !Subtarget->is64Bit() && "Unexpected size or subtarget"); + + if (Imm.isNonNegative() && Imm.isZero()) { + // Optimize +0.0 to use fcvt.d.w + MachineInstrBuilder FCVT = + MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)}) + .addImm(RISCVFPRndMode::RNE); + if (!FCVT.constrainAllUses(TII, TRI, RBI)) + return false; + + MI.eraseFromParent(); + return true; + } + // Split into two pieces and build through the stack. Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass); Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 8d956ce..96f22c2 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -819,6 +819,23 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F, Asm->getWriter().recordRelocation(F, VendorFixup, VendorTarget, VendorValue); } +static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { + // Some Fixups are marked as LinkerRelaxable by + // `RISCVMCCodeEmitter::getImmOpValue` only because they may be + // (assembly-)relaxed into a linker-relaxable instruction. This function + // should return `false` for those fixups so they do not get a `R_RISCV_RELAX` + // relocation emitted in addition to the relocation. + switch (Kind) { + default: + break; + case RISCV::fixup_riscv_rvc_jump: + case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_jal: + return false; + } + return true; +} + bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue, bool IsResolved) { @@ -861,25 +878,32 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, return false; } - // If linker relaxation is enabled and supported by the current relocation, - // generate a relocation and then append a RELAX. - if (Fixup.isLinkerRelaxable()) + // If linker relaxation is enabled and supported by the current fixup, then we + // always want to generate a relocation. + bool NeedsRelax = Fixup.isLinkerRelaxable() && + relaxableFixupNeedsRelocation(Fixup.getKind()); + if (NeedsRelax) IsResolved = false; + if (IsResolved && Fixup.isPCRel()) IsResolved = isPCRelFixupResolved(Target.getAddSym(), F); if (!IsResolved) { - // Some Fixups require a vendor relocation, record it (directly) before we + // Some Fixups require a VENDOR relocation, record it (directly) before we // add the relocation. maybeAddVendorReloc(F, Fixup); Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue); - } - if (Fixup.isLinkerRelaxable()) { - auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX); - Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr), - FixedValueA); + if (NeedsRelax) { + // Some Fixups get a RELAX relocation, record it (directly) after we add + // the relocation. 
+ MCFixup RelaxFixup = + MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX); + MCValue RelaxTarget = MCValue::get(nullptr); + uint64_t RelaxValue; + Asm->getWriter().recordRelocation(F, RelaxFixup, RelaxTarget, RelaxValue); + } } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index bddea43..083ac05 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -139,6 +139,9 @@ enum { // 3 -> SEW * 4 DestEEWShift = ElementsDependOnMaskShift + 1, DestEEWMask = 3ULL << DestEEWShift, + + ReadsPastVLShift = DestEEWShift + 2, + ReadsPastVLMask = 1ULL << ReadsPastVLShift, }; // Helper functions to read TSFlags. @@ -195,6 +198,12 @@ static inline bool elementsDependOnMask(uint64_t TSFlags) { return TSFlags & ElementsDependOnMaskMask; } +/// \returns true if the instruction may read elements past VL, e.g. +/// vslidedown/vrgather +static inline bool readsPastVL(uint64_t TSFlags) { + return TSFlags & ReadsPastVLMask; +} + static inline unsigned getVLOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; // This method is only called if we expect to have a VL operand, and all @@ -337,6 +346,7 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, + OPERAND_SIMM8, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM11, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 8c9ab8e..b0c27ce 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -75,7 +75,7 @@ void RISCVInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (PrintAliases && !NoAliases) Res = RISCVRVC::uncompress(UncompressedMI, *MI, STI); if (Res) - NewMI = const_cast<MCInst *>(&UncompressedMI); + NewMI = &UncompressedMI; if (!PrintAliases || NoAliases || !printAliasInstr(NewMI, Address, STI, O)) printInstruction(NewMI, Address, STI, O); printAnnotation(O, Annot); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index cbeabdd..717fba6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -576,8 +576,21 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, "getImmOpValue expects only expressions or immediates"); const MCExpr *Expr = MO.getExpr(); MCExpr::ExprKind Kind = Expr->getKind(); - unsigned FixupKind = RISCV::fixup_riscv_invalid; + + // `RelaxCandidate` must be set to `true` in two cases: + // - The fixup's relocation gets a R_RISCV_RELAX relocation + // - The underlying instruction may be relaxed to an instruction that gets a + // `R_RISCV_RELAX` relocation. + // + // The actual emission of `R_RISCV_RELAX` will be handled in + // `RISCVAsmBackend::applyFixup`. 
bool RelaxCandidate = false; + auto AsmRelaxToLinkerRelaxableWithFeature = [&](unsigned Feature) -> void { + if (!STI.hasFeature(RISCV::FeatureExactAssembly) && STI.hasFeature(Feature)) + RelaxCandidate = true; + }; + + unsigned FixupKind = RISCV::fixup_riscv_invalid; if (Kind == MCExpr::Specifier) { const auto *RVExpr = cast<MCSpecifierExpr>(Expr); FixupKind = RVExpr->getSpecifier(); @@ -644,18 +657,26 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // FIXME: Sub kind binary exprs have chance of underflow. if (MIFrm == RISCVII::InstFormatJ) { FixupKind = RISCV::fixup_riscv_jal; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb); } else if (MIFrm == RISCVII::InstFormatB) { FixupKind = RISCV::fixup_riscv_branch; + // This might be assembler relaxed to `b<cc>; jal` but we cannot relax + // the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatCJ) { FixupKind = RISCV::fixup_riscv_rvc_jump; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb); } else if (MIFrm == RISCVII::InstFormatCB) { FixupKind = RISCV::fixup_riscv_rvc_branch; + // This might be assembler relaxed to `b<cc>; jal` but we cannot relax + // the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatCI) { FixupKind = RISCV::fixup_riscv_rvc_imm; } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { FixupKind = RISCV::fixup_riscv_qc_e_branch; + // This might be assembler relaxed to `qc.e.b<cc>; jal` but we cannot + // relax the `jal` again in the assembler. } else if (MIFrm == RISCVII::InstFormatQC_EAI) { FixupKind = RISCV::fixup_riscv_qc_e_32; RelaxCandidate = true; @@ -670,9 +691,9 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, assert(FixupKind != RISCV::fixup_riscv_invalid && "Unhandled expression!"); addFixup(Fixups, 0, Expr, FixupKind); - // If linker relaxation is enabled and supported by this relocation, set - // a bit so that if fixup is unresolved, a R_RISCV_RELAX relocation will be - // appended. + // If linker relaxation is enabled and supported by this relocation, set a bit + // so that the assembler knows the size of the instruction is not fixed/known, + // and the relocation will need a R_RISCV_RELAX relocation. 
if (EnableRelax && RelaxCandidate) Fixups.back().setLinkerRelaxable(); ++MCNumFixups; diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index 70127e3..78f4779 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -741,7 +741,7 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, bool llvm::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State) { + Type *OrigTy, CCState &State) { if (ArgFlags.isNest()) { report_fatal_error( "Attribute 'nest' is not supported in GHC calling convention"); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h index 2030ce1..0847dd6 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.h +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -33,7 +33,7 @@ bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); namespace RISCV { diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a7329d2..d4ac3c6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1055,13 +1055,13 @@ def FeatureStdExtSupm "Indicates User-mode Pointer Masking">; def FeatureStdExtSmctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Machine Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Machine Level", + [FeatureStdExtSscsrind]>; def FeatureStdExtSsctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Supervisor Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Supervisor Level", + [FeatureStdExtSscsrind]>; def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, AssemblerPredicate<(any_of FeatureStdExtSmctr, FeatureStdExtSsctr), "'Smctr' (Control Transfer Records Machine Level) or " @@ -1642,6 +1642,14 @@ def HasVendorXAndesVDot AssemblerPredicate<(all_of FeatureVendorXAndesVDot), "'XAndesVDot' (Andes Vector Dot Product Extension)">; +def FeatureVendorXSMTVDot + : RISCVExtension<1, 0, "SpacemiT Vector Dot Product Extension", + [FeatureStdExtZve32f]>; +def HasVendorXSMTVDot + : Predicate<"Subtarget->hasVendorXSMTVDot()">, + AssemblerPredicate<(all_of FeatureVendorXSMTVDot), + "'XSMTVDot' (SpacemiT Vector Dot Product Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 5998653..f9f35f6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -18,6 +18,7 @@ #include "RISCVInstrInfo.h" #include "RISCVSelectionDAGInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Debug.h" @@ -681,40 +682,86 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { if (!Subtarget->hasVendorXqcibm()) return false; - auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); - if (!N1C) + using namespace SDPatternMatch; + + SDValue X; + 
APInt MaskImm; + if (!sd_match(Node, m_Or(m_OneUse(m_Value(X)), m_ConstInt(MaskImm)))) return false; - int32_t C1 = N1C->getSExtValue(); - if (!isShiftedMask_32(C1) || isInt<12>(C1)) + unsigned ShAmt, Width; + if (!MaskImm.isShiftedMask(ShAmt, Width) || MaskImm.isSignedIntN(12)) return false; - // INSBI will clobber the input register in N0. Bail out if we need a copy to - // preserve this value. - SDValue N0 = Node->getOperand(0); - if (!N0.hasOneUse()) + // If Zbs is enabled and it is a single bit set we can use BSETI which + // can be compressed to C_BSETI when Xqcibm in enabled. + if (Width == 1 && Subtarget->hasStdExtZbs()) return false; // If C1 is a shifted mask (but can't be formed as an ORI), // use a bitfield insert of -1. // Transform (or x, C1) // -> (qc.insbi x, -1, width, shift) - const unsigned Leading = llvm::countl_zero((uint32_t)C1); - const unsigned Trailing = llvm::countr_zero((uint32_t)C1); - const unsigned Width = 32 - Leading - Trailing; + SDLoc DL(Node); + MVT VT = Node->getSimpleValueType(0); - // If Zbs is enabled and it is a single bit set we can use BSETI which - // can be compressed to C_BSETI when Xqcibm in enabled. - if (Width == 1 && Subtarget->hasStdExtZbs()) + SDValue Ops[] = {X, CurDAG->getSignedTargetConstant(-1, DL, VT), + CurDAG->getTargetConstant(Width, DL, VT), + CurDAG->getTargetConstant(ShAmt, DL, VT)}; + SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); + ReplaceNode(Node, BitIns); + return true; +} + +// Generate a QC_INSB/QC_INSBI from 'or (and X, MaskImm), OrImm' iff the value +// being inserted only sets known zero bits. +bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromOrAndImm(SDNode *Node) { + // Supported only in Xqcibm for now. + if (!Subtarget->hasVendorXqcibm()) + return false; + + using namespace SDPatternMatch; + + SDValue And; + APInt MaskImm, OrImm; + if (!sd_match(Node, m_Or(m_OneUse(m_And(m_Value(And), m_ConstInt(MaskImm))), + m_ConstInt(OrImm)))) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + KnownBits Known = CurDAG->computeKnownBits(Node->getOperand(0)); + + // The bits being inserted must only set those bits that are known to be zero. + if (!OrImm.isSubsetOf(Known.Zero)) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + unsigned ShAmt, Width; + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!Known.Zero.isShiftedMask(ShAmt, Width)) return false; + // QC_INSB(I) dst, src, #width, #shamt. 
SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); + SDValue ImmNode; + auto Opc = RISCV::QC_INSB; - SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT), - CurDAG->getTargetConstant(Width, DL, VT), - CurDAG->getTargetConstant(Trailing, DL, VT)}; - SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); + int32_t LIImm = OrImm.getSExtValue() >> ShAmt; + + if (isInt<5>(LIImm)) { + Opc = RISCV::QC_INSBI; + ImmNode = CurDAG->getSignedTargetConstant(LIImm, DL, MVT::i32); + } else { + ImmNode = selectImm(CurDAG, DL, MVT::i32, LIImm, *Subtarget); + } + + SDValue Ops[] = {And, ImmNode, CurDAG->getTargetConstant(Width, DL, VT), + CurDAG->getTargetConstant(ShAmt, DL, VT)}; + SDNode *BitIns = CurDAG->getMachineNode(Opc, DL, VT, Ops); ReplaceNode(Node, BitIns); return true; } @@ -772,6 +819,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { return false; } +// (xor X, (and (xor X, C1), C2)) +// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt) +// where C2 is a shifted mask with width=Width and shift=ShAmt +bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) { + + if (!Subtarget->hasVendorXqcibm()) + return false; + + using namespace SDPatternMatch; + + SDValue X; + APInt CImm, CMask; + if (!sd_match( + Node, + m_Xor(m_Value(X), + m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))), + m_ConstInt(CMask)))))) + return false; + + unsigned Width, ShAmt; + if (!CMask.isShiftedMask(ShAmt, Width)) + return false; + + int64_t Imm = CImm.getSExtValue(); + Imm >>= ShAmt; + + SDLoc DL(Node); + SDValue ImmNode; + auto Opc = RISCV::QC_INSB; + + if (isInt<5>(Imm)) { + Opc = RISCV::QC_INSBI; + ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32); + } else { + ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget); + } + SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32), + CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)}; + ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops)); + + return true; +} + bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, @@ -1340,6 +1430,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (trySignedBitfieldInsertInMask(Node)) return; + if (tryBitfieldInsertOpFromOrAndImm(Node)) + return; + if (tryShrinkShlLogicImm(Node)) return; @@ -1349,6 +1442,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (tryBitfieldInsertOpFromXor(Node)) + return; + break; case ISD::AND: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -1644,7 +1740,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // available. 
// Transform (and x, C1) // -> (<bfextract> x, msb, lsb) - if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue())) { + if (isMask_64(C1) && !isInt<12>(N1C->getSExtValue()) && + !(C1 == 0xffff && Subtarget->hasStdExtZbb()) && + !(C1 == 0xffffffff && Subtarget->hasStdExtZba())) { const unsigned Msb = llvm::bit_width(C1) - 1; if (tryUnsignedBitfieldExtract(Node, DL, VT, N0, Msb, 0)) return; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index ee3a86e..c329a4c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -75,6 +75,8 @@ public: bool trySignedBitfieldExtract(SDNode *Node); bool trySignedBitfieldInsertInSign(SDNode *Node); bool trySignedBitfieldInsertInMask(SDNode *Node); + bool tryBitfieldInsertOpFromXor(SDNode *Node); + bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e4aa8b8..4a1db80 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + // Operands are (vec, ..., vec, ptr, offset, mask, vl) + return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vlm: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, @@ -2512,11 +2523,11 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, } break; case ISD::SETUGT: - if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isInt<16>(C + 1) && - C != -1) { + if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isUInt<16>(C + 1)) { // We have a branch immediate instruction for SETUGE but not SETUGT. - // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit signed immediate. - RHS = DAG.getSignedConstant(C + 1, DL, RHS.getValueType()); + // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit unsigned + // immediate. + RHS = DAG.getConstant(C + 1, DL, RHS.getValueType()); CC = ISD::SETUGE; return; } @@ -8931,10 +8942,7 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, // Prepare argument list to generate call. ArgListTy Args; - ArgListEntry Entry; - Entry.Node = Load; - Entry.Ty = CallTy; - Args.push_back(Entry); + Args.emplace_back(Load, CallTy); // Setup call to __tls_get_addr. 
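For the SETUGT change earlier in this hunk series, the rewrite relies on a simple unsigned identity: X > C and X >= C + 1 agree whenever C + 1 does not wrap, hence the guard against the maximal constant before forming the 16-bit unsigned immediate. A tiny sanity check with illustrative values only:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t C = 41; // assume C + 1 still fits the 16-bit unsigned immediate
      for (uint64_t X : {uint64_t(0), uint64_t(41), uint64_t(42), uint64_t(1000)})
        assert((X > C) == (X >= C + 1));
      return 0;
    }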
TargetLowering::CallLoweringInfo CLI(DAG); @@ -11084,69 +11092,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } -SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntNo = Op.getConstantOperandVal(1); +static SDValue +lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + bool IsStrided; switch (IntNo) { - default: - break; case Intrinsic::riscv_seg2_store_mask: case Intrinsic::riscv_seg3_store_mask: case Intrinsic::riscv_seg4_store_mask: case Intrinsic::riscv_seg5_store_mask: case Intrinsic::riscv_seg6_store_mask: case Intrinsic::riscv_seg7_store_mask: - case Intrinsic::riscv_seg8_store_mask: { - SDLoc DL(Op); - static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; + case Intrinsic::riscv_seg8_store_mask: + IsStrided = false; + break; + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + IsStrided = true; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } - // Operands: (chain, int_id, vec*, ptr, mask, vl) - unsigned NF = Op->getNumOperands() - 5; - assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); - MVT XLenVT = Subtarget.getXLenVT(); - MVT VT = Op->getOperand(2).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * - ContainerVT.getScalarSizeInBits(); - EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); + SDLoc DL(Op); + static const Intrinsic::ID VssegInts[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + static const Intrinsic::ID VsssegInts[] = { + Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask, + Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask, + Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask, + Intrinsic::riscv_vssseg8_mask}; + + // Operands: (chain, int_id, vec*, ptr, mask, vl) or + // (chain, int_id, vec*, ptr, stride, mask, vl) + unsigned NF = Op->getNumOperands() - (IsStrided ? 
6 : 5); + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getOperand(2).getSimpleValueType(); + MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget); + unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * + ContainerVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); - SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); - MVT MaskVT = Mask.getSimpleValueType(); - MVT MaskContainerVT = - ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); - Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); + MVT MaskVT = Mask.getSimpleValueType(); + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); - SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); - SDValue Ptr = Op->getOperand(NF + 2); + SDValue IntID = DAG.getTargetConstant( + IsStrided ? VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT); + SDValue Ptr = Op->getOperand(NF + 2); - auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); + auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); - SDValue StoredVal = DAG.getUNDEF(VecTupTy); - for (unsigned i = 0; i < NF; i++) - StoredVal = DAG.getNode( - RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, - convertToScalableVector( - ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget), - DAG.getTargetConstant(i, DL, MVT::i32)); + SDValue StoredVal = DAG.getUNDEF(VecTupTy); + for (unsigned i = 0; i < NF; i++) + StoredVal = DAG.getNode( + RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i), + DAG, Subtarget), + DAG.getTargetConstant(i, DL, MVT::i32)); + + SmallVector<SDValue, 10> Ops = { + FixedIntrinsic->getChain(), + IntID, + StoredVal, + Ptr, + Mask, + VL, + DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; + // Insert the stride operand. 
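To make the operand shuffling easier to follow: the strided form being built here differs from the unit-stride form only by one extra operand spliced in right after the pointer, at index 4. A trivial stand-alone model of that splice (the string names are placeholders, not real operand names):

    #include <cassert>
    #include <iterator>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Ops = {"chain", "intid", "tuple", "ptr",
                                      "mask", "vl", "log2sew"};
      bool IsStrided = true;
      if (IsStrided)
        Ops.insert(std::next(Ops.begin(), 4), "stride");
      assert(Ops[3] == "ptr" && Ops[4] == "stride" && Ops[5] == "mask");
      return 0;
    }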
+ if (IsStrided) + Ops.insert(std::next(Ops.begin(), 4), + Op.getOperand(Op.getNumOperands() - 3)); + + return DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, + FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); +} + +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG); - SDValue Ops[] = { - FixedIntrinsic->getChain(), - IntID, - StoredVal, - Ptr, - Mask, - VL, - DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; - - return DAG.getMemIntrinsicNode( - ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, - FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); - } case Intrinsic::riscv_sf_vc_xv_se: return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE); case Intrinsic::riscv_sf_vc_iv_se: @@ -14273,7 +14330,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0)); MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -14308,7 +14365,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); @@ -16531,8 +16588,10 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg // can become a sext.w instead of a shift pair. -static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -16548,6 +16607,20 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget)) return V; + // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI. 
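The transform described in that comment is sound because -4096 is a negated power of two: the AND keeps exactly the bits at or above the trailing-zero count, and those are all zero iff the logical right shift by that count is zero. The shift amount always encodes, whereas the mask does not fit ANDI's 12-bit signed immediate. A quick check with plain integers (GCC/Clang builtin used for the trailing-zero count):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t M = ~uint64_t(0xFFF);   // the bit pattern of -4096
      unsigned K = __builtin_ctzll(M); // 12
      for (uint64_t X : {uint64_t(0), uint64_t(0xFFF), uint64_t(0x1000),
                         uint64_t(0xDEADBEEF)})
        assert(((X & M) == 0) == ((X >> K) == 0));
      return 0;
    }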
+ if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) && + N0.getOpcode() == ISD::AND && N0.hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(1))) { + const APInt &AndRHSC = + cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) { + unsigned ShiftBits = AndRHSC.countr_zero(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0), + DAG.getConstant(ShiftBits, dl, VT)); + return DAG.getSetCC(dl, VT, Shift, N1, Cond); + } + } + if (OpVT != MVT::i64 || !Subtarget.is64Bit()) return SDValue(); @@ -16582,27 +16655,39 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, } static SDValue -performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, +performSIGN_EXTEND_INREGCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); unsigned Opc = Src.getOpcode(); + SDLoc DL(N); // Fold (sext_inreg (fmv_x_anyexth X), i16) -> (fmv_x_signexth X) // Don't do this with Zhinx. We need to explicitly sign extend the GPR. if (Opc == RISCVISD::FMV_X_ANYEXTH && SrcVT.bitsGE(MVT::i16) && Subtarget.hasStdExtZfhmin()) - return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, SDLoc(N), VT, - Src.getOperand(0)); + return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, DL, VT, Src.getOperand(0)); // Fold (sext_inreg (shl X, Y), i32) -> (sllw X, Y) iff Y u< 32 if (Opc == ISD::SHL && Subtarget.is64Bit() && SrcVT == MVT::i32 && VT == MVT::i64 && !isa<ConstantSDNode>(Src.getOperand(1)) && DAG.computeKnownBits(Src.getOperand(1)).countMaxActiveBits() <= 5) - return DAG.getNode(RISCVISD::SLLW, SDLoc(N), VT, Src.getOperand(0), + return DAG.getNode(RISCVISD::SLLW, DL, VT, Src.getOperand(0), Src.getOperand(1)); + // Fold (sext_inreg (setcc), i1) -> (sub 0, (setcc)) + if (Opc == ISD::SETCC && SrcVT == MVT::i1 && DCI.isAfterLegalizeDAG()) + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + + // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1) + if (Opc == ISD::XOR && SrcVT == MVT::i1 && + isAllOnesConstant(Src.getOperand(1)) && + Src.getOperand(0).getOpcode() == ISD::SETCC && DCI.isAfterLegalizeDAG()) + return DAG.getNode(ISD::ADD, DL, VT, Src.getOperand(0), + DAG.getAllOnesConstant(DL, VT)); + return SDValue(); } @@ -17461,7 +17546,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, return SDValue(); SmallVector<SDNode *> Worklist; - SmallSet<SDNode *, 8> Inserted; + SmallPtrSet<SDNode *, 8> Inserted; Worklist.push_back(N); Inserted.insert(N); SmallVector<CombineResult> CombinesToApply; @@ -20022,9 +20107,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } case ISD::SETCC: - return performSETCCCombine(N, DAG, Subtarget); + return performSETCCCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: - return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); + return performSIGN_EXTEND_INREGCombine(N, DCI, Subtarget); case ISD::ZERO_EXTEND: // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during // type legalization. This is safe because fp_to_uint produces poison if @@ -20580,10 +20665,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1. // vfmv.f.s is represented as extract element from 0. Match it late to avoid // any illegal types. 
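The two new i1 folds in performSIGN_EXTEND_INREGCombine above follow from setcc producing only 0 or 1: sign-extending that single bit is the same as negating it, and sign-extending its bitwise complement is the same as adding -1. A small model (the helper name is invented):

    #include <cassert>
    #include <cstdint>

    // Sign-extend from bit 0, i.e. sext_inreg(..., i1).
    int64_t sextI1(int64_t V) { return (V & 1) ? -1 : 0; }

    int main() {
      for (int64_t B : {int64_t(0), int64_t(1)}) {
        assert(sextI1(B) == 0 - B);       // (sext_inreg (setcc), i1) -> (sub 0, (setcc))
        assert(sextI1(B ^ -1) == B + -1); // xor-with-allones variant -> (add (setcc), -1)
      }
      return 0;
    }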
- if (Val.getOpcode() == RISCVISD::VMV_X_S || - (DCI.isAfterLegalizeDAG() && - Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isNullConstant(Val.getOperand(1)))) { + if ((Val.getOpcode() == RISCVISD::VMV_X_S || + (DCI.isAfterLegalizeDAG() && + Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isNullConstant(Val.getOperand(1)))) && + Val.hasOneUse()) { SDValue Src = Val.getOperand(0); MVT VecVT = Src.getSimpleValueType(); // VecVT should be scalable and memory VT should match the element type. @@ -20673,12 +20759,22 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, isNullConstant(Src.getOperand(1)) && Src.getOperand(0).getValueType().isScalableVector()) { EVT VT = N->getValueType(0); - EVT SrcVT = Src.getOperand(0).getValueType(); - assert(SrcVT.getVectorElementType() == VT.getVectorElementType()); + SDValue EVSrc = Src.getOperand(0); + EVT EVSrcVT = EVSrc.getValueType(); + assert(EVSrcVT.getVectorElementType() == VT.getVectorElementType()); // Widths match, just return the original vector. - if (SrcVT == VT) - return Src.getOperand(0); - // TODO: Use insert_subvector/extract_subvector to change widen/narrow? + if (EVSrcVT == VT) + return EVSrc; + SDLoc DL(N); + // Width is narrower, using insert_subvector. + if (EVSrcVT.getVectorMinNumElements() < VT.getVectorMinNumElements()) { + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + EVSrc, + DAG.getConstant(0, DL, Subtarget.getXLenVT())); + } + // Width is wider, using extract_subvector. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, EVSrc, + DAG.getConstant(0, DL, Subtarget.getXLenVT())); } [[fallthrough]]; } @@ -22270,20 +22366,12 @@ void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, RISCVCCAssignFn Fn) const { - FunctionType *FType = MF.getFunction().getFunctionType(); - for (const auto &[Idx, In] : enumerate(Ins)) { MVT ArgVT = In.VT; ISD::ArgFlagsTy ArgFlags = In.Flags; - Type *ArgTy = nullptr; - if (IsRet) - ArgTy = FType->getReturnType(); - else if (In.isOrigArg()) - ArgTy = FType->getParamType(In.getOrigArgIndex()); - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, - ArgTy)) { + In.OrigTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -22298,10 +22386,9 @@ void RISCVTargetLowering::analyzeOutputArgs( for (const auto &[Idx, Out] : enumerate(Outs)) { MVT ArgVT = Out.VT; ISD::ArgFlagsTy ArgFlags = Out.Flags; - Type *OrigTy = CLI ? 
CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr; if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, - OrigTy)) { + Out.OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -23083,7 +23170,7 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsRet=*/true, nullptr)) + /*IsRet=*/true, Outs[i].OrigTy)) return false; } return true; @@ -23343,6 +23430,12 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vd") { for (const auto *RC : @@ -23356,10 +23449,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4NoV0RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vm") { if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) return std::make_pair(0U, &RISCV::VMV0RegClass); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + // VT here might be coerced to vector with i8 elements, so we need to + // check if this is a M1 register here instead of checking VMV0RegClass. 
+ if (TRI->isTypeLegalForClass(RISCV::VRRegClass, ContainerVT)) + return std::make_pair(0U, &RISCV::VMV0RegClass); + } } else if (Constraint == "cr") { if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) return std::make_pair(0U, &RISCV::GPRF16CRegClass); @@ -24237,7 +24344,12 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( return true; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { + if (ValueVT.isFixedLengthVector()) { + ValueVT = getContainerForFixedLengthVector(ValueVT.getSimpleVT()); + Val = convertToScalableVector(ValueVT, Val, DAG, Subtarget); + } LLVMContext &Context = *DAG.getContext(); EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); @@ -24307,12 +24419,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( return Val; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { LLVMContext &Context = *DAG.getContext(); SDValue Val = Parts[0]; EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinValue(); + if (ValueVT.isFixedLengthVector()) + ValueVTBitSize = getContainerForFixedLengthVector(ValueVT.getSimpleVT()) + .getSizeInBits() + .getKnownMinValue(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinValue(); if (PartVTBitSize % ValueVTBitSize == 0) { assert(PartVTBitSize >= ValueVTBitSize); @@ -24330,7 +24447,10 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); } - Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); + if (ValueVT.isFixedLengthVector()) + Val = convertFromScalableVector(ValueVT, Val, DAG, Subtarget); + else + Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); return Val; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 433b8be..fb63ebc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -431,8 +431,8 @@ public: bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index d9c6101..878a0ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -261,6 +261,12 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, // Indicates the EEW of a vector instruction's destination operand. EEW DestEEW = EEWSEWx1; let TSFlags{25-24} = DestEEW.Value; + + // Some vector instructions like vslidedown/vrgather will read elements past + // VL, and should be marked to make sure RISCVVLOptimizer doesn't reduce its + // operands' VLs. 
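To make the comment above concrete, here is a rough scalar model of vslidedown.vx under stated assumptions (elements past VLMAX read as 0; all names are invented): even though only VL destination elements are written, source indices run up to VL + offset - 1, so shrinking the source operand's VL would change the result, which is exactly what the new flag tells RISCVVLOptimizer not to do.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> slideDown(const std::vector<uint32_t> &Src, unsigned VL,
                                    unsigned Offset) {
      std::vector<uint32_t> Dst(VL, 0);
      for (unsigned I = 0; I < VL; ++I)
        Dst[I] = (I + Offset < Src.size()) ? Src[I + Offset] : 0;
      return Dst;
    }

    int main() {
      std::vector<uint32_t> Src = {10, 11, 12, 13, 14, 15, 16, 17}; // VLMAX = 8
      auto Dst = slideDown(Src, /*VL=*/4, /*Offset=*/2);
      assert(Dst[3] == Src[5]); // reads source element 5, which is past VL = 4
      return 0;
    }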
+ bit ReadsPastVL = 0; + let TSFlags{26} = ReadsPastVL; } class RVInst<dag outs, dag ins, string opcodestr, string argstr, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 8bd3830..836a2b1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1694,6 +1694,16 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> { valty:$truev, valty:$falsev), []>; } +let Predicates = [IsRV32] in { +def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible +} +let Predicates = [IsRV64] in { +def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)), + (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>; +def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible +def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>; +} + /// Branches and jumps // Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction. @@ -2367,6 +2377,7 @@ include "RISCVInstrInfoXqccmp.td" include "RISCVInstrInfoXMips.td" include "RISCVInstrInfoXRivos.td" include "RISCVInstrInfoXAndes.td" +include "RISCVInstrInfoXSpacemiT.td" //===----------------------------------------------------------------------===// // Global ISel diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 8297d50..1e22c2d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,26 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmLeafOp<10>; +def simm10 : RISCVSImmOp<10>; + +def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { + let RenderMethod = "addSImm8UnsignedOperands"; +} + +// A 8-bit signed immediate allowing range [-128, 255] +// but represented as [-128, 255]. +def simm8_unsigned : RISCVOp { + let ParserMatchClass = SImm8UnsignedAsmOperand; + let EncoderMethod = "getImmOpValue"; + let DecoderMethod = "decodeSImmOperand<8>"; + let OperandType = "OPERAND_SIMM10"; + let MCOperandPredicate = [{ + int64_t Imm; + if (!MCOp.evaluateAsConstantImm(Imm)) + return false; + return isInt<8>(Imm); + }]; +} def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> { let RenderMethod = "addSImm10UnsignedOperands"; @@ -43,49 +62,40 @@ def simm10_unsigned : RISCVOp { // Instruction class templates //===----------------------------------------------------------------------===// -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_i<bits<7> funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [], +// Common base for pli.b/h/w and plui.h/w +class RVPLoadImm_i<bits<7> funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPR:$rd), ins, opcodestr, argstr, [], InstFormatOther> { - bits<10> imm10; bits<5> rd; let Inst{31-25} = funct7; - let Inst{24-16} = imm10{8-0}; - let Inst{15} = imm10{9}; let Inst{14-12} = 0b010; let Inst{11-7} = rd; let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLUI_i<bits<7> funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr, - "$rd, $imm10", [], InstFormatOther> { +// Base for pli.h/w. 
+class PLI_i<bits<7> funct7, string opcodestr> + : RVPLoadImm_i<funct7, (ins simm10:$imm10), opcodestr, "$rd, $imm10"> { bits<10> imm10; - bits<5> rd; - let Inst{31-25} = funct7; - let Inst{24} = imm10{0}; - let Inst{23-15} = imm10{9-1}; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; + let Inst{24-16} = imm10{8-0}; + let Inst{15} = imm10{9}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_B_i<bits<8> funct8, string opcodestr> - : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [], - InstFormatOther> { - bits<8> uimm8; - bits<5> rd; +// Base for plui.h/w. +class PLUI_i<bits<7> funct7, string opcodestr> + : RVPLoadImm_i<funct7, (ins simm10_unsigned:$imm10), opcodestr, + "$rd, $imm10"> { + bits<10> imm10; - let Inst{31-24} = funct8; - let Inst{23-16} = uimm8; - let Inst{15} = 0b0; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -98,6 +108,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> let Inst{27} = 0b0; } +class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm6> { + bits<6> shamt; + + let Inst{26} = 0b1; + let Inst{25-20} = shamt; +} + class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr> : RVPShift_ri<f, funct3, opcodestr, uimm5> { bits<5> shamt; @@ -131,59 +149,477 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> let Inst{24-20} = uf; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPBinaryScalar_rr<bits<3> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-28} = f; + let Inst{27} = 0b1; + let Inst{26-25} = w; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-27} = f; + let Inst{26-25} = w; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> + : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb), + (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + let Inst{31} = 0b1; + let Inst{30-27} = f; + let Inst{26-25} = w; + + let Constraints = "$rd = $rd_wb"; +} + +// Common base for pli.db/h/w and plui.dh/w +class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPRPairRV32:$rd), ins, opcodestr, argstr, [], + InstFormatOther> { + bits<5> rd; + + let Inst{31-25} = funct7; + let Inst{14-12} = 0b010; + let Inst{11-8} = rd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { -let IsSignExtendingOpW = 1 in -def CLS : Unary_r<0b011000000011, 0b001, "cls">; -def ABS : Unary_r<0b011000000111, 0b001, "abs">; + let IsSignExtendingOpW = 1 in + def CLS : Unary_r<0b011000000011, 0b001, "cls">; + def ABS : Unary_r<0b011000000111, 0b001, 
"abs">; } // Predicates = [HasStdExtP] -let Predicates = [HasStdExtP, IsRV32] in -def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; + +let Predicates = [HasStdExtP, IsRV32] in { + def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; +} // Predicates = [HasStdExtP, IsRV32] let Predicates = [HasStdExtP, IsRV64] in { -def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; -def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; + def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; + def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; -let IsSignExtendingOpW = 1 in { -def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; -def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; -} + let IsSignExtendingOpW = 1 in { + def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; + def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; + } } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { -def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; -def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; -def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; + def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; + def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; + def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; } // Predicates = [HasStdExtP] -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtP, IsRV32] in -def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" let Predicates = [HasStdExtP, IsRV64] in { -def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; -def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; + def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; + def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in def PLI_H : PLI_i<0b1011000, "pli.h">; let Predicates = [HasStdExtP, IsRV64] in def PLI_W : PLI_i<0b1011001, "pli.w">; -let Predicates = [HasStdExtP] in -def PLI_B : PLI_B_i<0b10110100, "pli.b">; +let Predicates = [HasStdExtP] in { + def PLI_B : RVPLoadImm_i<0b1011010, (ins simm8_unsigned:$imm8), "pli.b", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } +} let Predicates = [HasStdExtP] in { -def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; -def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; -def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; + def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; + def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; + def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV64] in { -def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; -def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; + def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; + def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in def PLUI_H : PLUI_i<0b1111000, "plui.h">; let Predicates = [HasStdExtP, IsRV64] in def PLUI_W : PLUI_i<0b1111001, "plui.w">; + +let Predicates = [HasStdExtP] in { + def PSLL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b010, "psll.hs">; + def PSLL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b010, "psll.bs">; + + def PADD_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b010, "padd.hs">; + def PADD_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b010, "padd.bs">; + + def PSSHA_HS : 
RVPBinaryScalar_rr<0b110, 0b00, 0b010, "pssha.hs">; + + def PSSHAR_HS : RVPBinaryScalar_rr<0b111, 0b00, 0b010, "psshar.hs">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSHA : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "ssha">; + + def SSHAR : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "sshar">; +} // Predicates = [HasStdExtP, IsRV32] +let Predicates = [HasStdExtP, IsRV64] in { + def PSLL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b010, "psll.ws">; + + def PADD_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b010, "padd.ws">; + + def PSSHA_WS : RVPBinaryScalar_rr<0b110, 0b01, 0b010, "pssha.ws">; + def SHA : RVPBinaryScalar_rr<0b110, 0b11, 0b010, "sha">; + + def PSSHAR_WS : RVPBinaryScalar_rr<0b111, 0b01, 0b010, "psshar.ws">; + def SHAR : RVPBinaryScalar_rr<0b111, 0b11, 0b010, "shar">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PSRLI_B : RVPShiftB_ri<0b000, 0b100, "psrli.b">; + def PSRLI_H : RVPShiftH_ri<0b000, 0b100, "psrli.h">; + + def PUSATI_H : RVPShiftH_ri<0b010, 0b100, "pusati.h">; + + def PSRAI_B : RVPShiftB_ri<0b100, 0b100, "psrai.b">; + def PSRAI_H : RVPShiftH_ri<0b100, 0b100, "psrai.h">; + + def PSRARI_H : RVPShiftH_ri<0b101, 0b100, "psrari.h">; + + def PSATI_H : RVPShiftH_ri<0b110, 0b100, "psati.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">; + + def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">; + + def SATI_RV32 : RVPShiftW_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV32] +let Predicates = [HasStdExtP, IsRV64] in { + def PSRLI_W : RVPShiftW_ri<0b000, 0b100, "psrli.w">; + def PSRAI_W : RVPShiftW_ri<0b100, 0b100, "psrai.w">; + + def PUSATI_W : RVPShiftW_ri<0b010, 0b100, "pusati.w">; + def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">; + + def PSRARI_W : RVPShiftW_ri<0b101, 0b100, "psrari.w">; + def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">; + + def PSATI_W : RVPShiftW_ri<0b110, 0b100, "psati.w">; + def SATI_RV64 : RVPShiftD_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PSRL_HS : RVPBinaryScalar_rr<0b000, 0b00, 0b100, "psrl.hs">; + def PSRL_BS : RVPBinaryScalar_rr<0b000, 0b10, 0b100, "psrl.bs">; + + def PREDSUM_HS : RVPBinaryScalar_rr<0b001, 0b00, 0b100, "predsum.hs">; + def PREDSUM_BS : RVPBinaryScalar_rr<0b001, 0b10, 0b100, "predsum.bs">; + + def PREDSUMU_HS : RVPBinaryScalar_rr<0b011, 0b00, 0b100, "predsumu.hs">; + def PREDSUMU_BS : RVPBinaryScalar_rr<0b011, 0b10, 0b100, "predsumu.bs">; + + def PSRA_HS : RVPBinaryScalar_rr<0b100, 0b00, 0b100, "psra.hs">; + def PSRA_BS : RVPBinaryScalar_rr<0b100, 0b10, 0b100, "psra.bs">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV64] in { + def PSRL_WS : RVPBinaryScalar_rr<0b000, 0b01, 0b100, "psrl.ws">; + + def PREDSUM_WS : RVPBinaryScalar_rr<0b001, 0b01, 0b100, "predsum.ws">; + + def PREDSUMU_WS : RVPBinaryScalar_rr<0b011, 0b01, 0b100, "predsumu.ws">; + + def PSRA_WS : RVPBinaryScalar_rr<0b100, 0b01, 0b100, "psra.ws">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PADD_H : RVPBinary_rr<0b0000, 0b00, 0b000, "padd.h">; + def PADD_B : RVPBinary_rr<0b0000, 0b10, 0b000, "padd.b">; + + def PSADD_H : RVPBinary_rr<0b0010, 0b00, 0b000, "psadd.h">; + def PSADD_B : RVPBinary_rr<0b0010, 0b10, 0b000, "psadd.b">; + + def PAADD_H : RVPBinary_rr<0b0011, 0b00, 0b000, "paadd.h">; + def PAADD_B : 
RVPBinary_rr<0b0011, 0b10, 0b000, "paadd.b">; + + def PSADDU_H : RVPBinary_rr<0b0110, 0b00, 0b000, "psaddu.h">; + def PSADDU_B : RVPBinary_rr<0b0110, 0b10, 0b000, "psaddu.b">; + + def PAADDU_H : RVPBinary_rr<0b0111, 0b00, 0b000, "paaddu.h">; + def PAADDU_B : RVPBinary_rr<0b0111, 0b10, 0b000, "paaddu.b">; + + def PSUB_H : RVPBinary_rr<0b1000, 0b00, 0b000, "psub.h">; + def PSUB_B : RVPBinary_rr<0b1000, 0b10, 0b000, "psub.b">; + + def PDIF_H : RVPBinary_rr<0b1001, 0b00, 0b000, "pdif.h">; + def PDIF_B : RVPBinary_rr<0b1001, 0b10, 0b000, "pdif.b">; + + def PSSUB_H : RVPBinary_rr<0b1010, 0b00, 0b000, "pssub.h">; + def PSSUB_B : RVPBinary_rr<0b1010, 0b10, 0b000, "pssub.b">; + + def PASUB_H : RVPBinary_rr<0b1011, 0b00, 0b000, "pasub.h">; + def PASUB_B : RVPBinary_rr<0b1011, 0b10, 0b000, "pasub.b">; + + def PDIFU_H : RVPBinary_rr<0b1101, 0b00, 0b000, "pdifu.h">; + def PDIFU_B : RVPBinary_rr<0b1101, 0b10, 0b000, "pdifu.b">; + + def PSSUBU_H : RVPBinary_rr<0b1110, 0b00, 0b000, "pssubu.h">; + def PSSUBU_B : RVPBinary_rr<0b1110, 0b10, 0b000, "pssubu.b">; + + def PASUBU_H : RVPBinary_rr<0b1111, 0b00, 0b000, "pasubu.h">; + def PASUBU_B : RVPBinary_rr<0b1111, 0b10, 0b000, "pasubu.b">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SADD : RVPBinary_rr<0b0010, 0b01, 0b000, "sadd">; + + def AADD : RVPBinary_rr<0b0011, 0b01, 0b000, "aadd">; + + def SADDU : RVPBinary_rr<0b0110, 0b01, 0b000, "saddu">; + + def AADDU : RVPBinary_rr<0b0111, 0b01, 0b000, "aaddu">; + + def SSUB : RVPBinary_rr<0b1010, 0b01, 0b000, "ssub">; + + def ASUB : RVPBinary_rr<0b1011, 0b01, 0b000, "asub">; + + def SSUBU : RVPBinary_rr<0b1110, 0b01, 0b000, "ssubu">; + + def ASUBU : RVPBinary_rr<0b1111, 0b01, 0b000, "asubu">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PADD_W : RVPBinary_rr<0b0000, 0b01, 0b000, "padd.w">; + + def PSADD_W : RVPBinary_rr<0b0010, 0b01, 0b000, "psadd.w">; + + def PAADD_W : RVPBinary_rr<0b0011, 0b01, 0b000, "paadd.w">; + + def PSADDU_W : RVPBinary_rr<0b0110, 0b01, 0b000, "psaddu.w">; + + def PAADDU_W : RVPBinary_rr<0b0111, 0b01, 0b000, "paaddu.w">; + + def PSUB_W : RVPBinary_rr<0b1000, 0b01, 0b000, "psub.w">; + + def PSSUB_W : RVPBinary_rr<0b1010, 0b01, 0b000, "pssub.w">; + + def PASUB_W : RVPBinary_rr<0b1011, 0b01, 0b000, "pasub.w">; + + def PSSUBU_W : RVPBinary_rr<0b1110, 0b01, 0b000, "pssubu.w">; + + def PASUBU_W : RVPBinary_rr<0b1111, 0b01, 0b000, "pasubu.w">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def SLX : RVPBinary_rr<0b0001, 0b11, 0b001, "slx">; + + def PMUL_H_B01 : RVPBinary_rr<0b0010, 0b00, 0b001, "pmul.h.b01">; + + def MVM : RVPTernary_rrr<0b0101, 0b00, 0b001, "mvm">; + def MVMN : RVPTernary_rrr<0b0101, 0b01, 0b001, "mvmn">; + def MERGE : RVPTernary_rrr<0b0101, 0b10, 0b001, "merge">; + def SRX : RVPTernary_rrr<0b0101, 0b11, 0b001, "srx">; + + def PMULU_H_B01 : RVPBinary_rr<0b0110, 0b00, 0b001, "pmulu.h.b01">; + def PDIFSUMU_B : RVPBinary_rr<0b0110, 0b10, 0b001, "pdifsumu.b">; + + def PDIFSUMAU_B : RVPTernary_rrr<0b0111, 0b10, 0b001, "pdifsumau.b">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def MUL_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "mul.h01">; + + def MACC_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "macc.h01">; + + def MULU_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "mulu.h01">; + + def MACCU_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "maccu.h01">; +} // 
Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PMUL_W_H01 : RVPBinary_rr<0b0010, 0b01, 0b001, "pmul.w.h01">; + def MUL_W01 : RVPBinary_rr<0b0010, 0b11, 0b001, "mul.w01">; + + def PMACC_W_H01 : RVPTernary_rrr<0b0011, 0b01, 0b001, "pmacc.w.h01">; + def MACC_W01 : RVPTernary_rrr<0b0011, 0b11, 0b001, "macc.w01">; + + def PMULU_W_H01 : RVPBinary_rr<0b0110, 0b01, 0b001, "pmulu.w.h01">; + def MULU_W01 : RVPBinary_rr<0b0110, 0b11, 0b001, "mulu.w01">; + + def PMACCU_W_H01 : RVPTernary_rrr<0b0111, 0b01, 0b001, "pmaccu.w.h01">; + def MACCU_W01 : RVPTernary_rrr<0b0111, 0b11, 0b001, "maccu.w01">; +} // Predicates = [HasStdExtP, IsRV64] + +// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. +// Here we include the 0 in the f field to reduce number of tablegen classes. +let Predicates = [HasStdExtP] in { + def PSH1ADD_H : RVPBinary_rr<0b0100, 0b00, 0b010, "psh1add.h">; + + def PSSH1SADD_H : RVPBinary_rr<0b0110, 0b00, 0b010, "pssh1sadd.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSH1SADD : RVPBinary_rr<0b0110, 0b01, 0b010, "ssh1sadd">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PSH1ADD_W : RVPBinary_rr<0b0100, 0b01, 0b010, "psh1add.w">; + + def PSSH1SADD_W : RVPBinary_rr<0b0110, 0b01, 0b010, "pssh1sadd.w">; + + def UNZIP8P : RVPBinary_rr<0b1100, 0b00, 0b010, "unzip8p">; + def UNZIP16P : RVPBinary_rr<0b1100, 0b01, 0b010, "unzip16p">; + def UNZIP8HP : RVPBinary_rr<0b1100, 0b10, 0b010, "unzip8hp">; + def UNZIP16HP : RVPBinary_rr<0b1100, 0b11, 0b010, "unzip16hp">; + + def ZIP8P : RVPBinary_rr<0b1110, 0b00, 0b010, "zip8p">; + def ZIP16P : RVPBinary_rr<0b1110, 0b01, 0b010, "zip16p">; + def ZIP8HP : RVPBinary_rr<0b1110, 0b10, 0b010, "zip8hp">; + def ZIP16HP : RVPBinary_rr<0b1110, 0b11, 0b010, "zip16hp">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP] in { + def PMUL_H_B00 : RVPBinary_rr<0b0000, 0b00, 0b011, "pmul.h.b00">; + + def PMUL_H_B11 : RVPBinary_rr<0b0010, 0b00, 0b011, "pmul.h.b11">; + + def PMULU_H_B00 : RVPBinary_rr<0b0100, 0b00, 0b011, "pmulu.h.b00">; + + def PMULU_H_B11 : RVPBinary_rr<0b0110, 0b00, 0b011, "pmulu.h.b11">; + + def PMULSU_H_B00 : RVPBinary_rr<0b1100, 0b00, 0b011, "pmulsu.h.b00">; + + def PMULSU_H_B11 : RVPBinary_rr<0b1110, 0b00, 0b011, "pmulsu.h.b11">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def MUL_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, "mul.h00">; + + def MACC_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "macc.h00">; + + def MUL_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "mul.h11">; + + def MACC_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "macc.h11">; + + def MULU_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "mulu.h00">; + + def MACCU_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "maccu.h00">; + + def MULU_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "mulu.h11">; + + def MACCU_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "maccu.h11">; + + def MULSU_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "mulsu.h00">; + + def MACCSU_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "maccsu.h00">; + + def MULSU_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "mulsu.h11">; + + def MACCSU_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "maccsu.h11">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PMUL_W_H00 : RVPBinary_rr<0b0000, 0b01, 0b011, 
"pmul.w.h00">; + def MUL_W00 : RVPBinary_rr<0b0000, 0b11, 0b011, "mul.w00">; + + def PMACC_W_H00 : RVPTernary_rrr<0b0001, 0b01, 0b011, "pmacc.w.h00">; + def MACC_W00 : RVPTernary_rrr<0b0001, 0b11, 0b011, "macc.w00">; + + def PMUL_W_H11 : RVPBinary_rr<0b0010, 0b01, 0b011, "pmul.w.h11">; + def MUL_W11 : RVPBinary_rr<0b0010, 0b11, 0b011, "mul.w11">; + + def PMACC_W_H11 : RVPTernary_rrr<0b0011, 0b01, 0b011, "pmacc.w.h11">; + def MACC_W11 : RVPTernary_rrr<0b0011, 0b11, 0b011, "macc.w11">; + + def PMULU_W_H00 : RVPBinary_rr<0b0100, 0b01, 0b011, "pmulu.w.h00">; + def MULU_W00 : RVPBinary_rr<0b0100, 0b11, 0b011, "mulu.w00">; + + def PMACCU_W_H00 : RVPTernary_rrr<0b0101, 0b01, 0b011, "pmaccu.w.h00">; + def MACCU_W00 : RVPTernary_rrr<0b0101, 0b11, 0b011, "maccu.w00">; + + def PMULU_W_H11 : RVPBinary_rr<0b0110, 0b01, 0b011, "pmulu.w.h11">; + def MULU_W11 : RVPBinary_rr<0b0110, 0b11, 0b011, "mulu.w11">; + + def PMACCU_W_H11 : RVPTernary_rrr<0b0111, 0b01, 0b011, "pmaccu.w.h11">; + def MACCU_W11 : RVPTernary_rrr<0b0111, 0b11, 0b011, "maccu.w11">; + + def PMULSU_W_H00 : RVPBinary_rr<0b1100, 0b01, 0b011, "pmulsu.w.h00">; + def MULSU_W00 : RVPBinary_rr<0b1100, 0b11, 0b011, "mulsu.w00">; + + def PMACCSU_W_H00 : RVPTernary_rrr<0b1101, 0b01, 0b011, "pmaccsu.w.h00">; + def MACCSU_W00 : RVPTernary_rrr<0b1101, 0b11, 0b011, "maccsu.w00">; + + def PMULSU_W_H11 : RVPBinary_rr<0b1110, 0b01, 0b011, "pmulsu.w.h11">; + def MULSU_W11 : RVPBinary_rr<0b1110, 0b11, 0b011, "mulsu.w11">; + + def PMACCSU_W_H11 : RVPTernary_rrr<0b1111, 0b01, 0b011, "pmaccsu.w.h11">; + def MACCSU_W11 : RVPTernary_rrr<0b1111, 0b11, 0b011, "maccsu.w11">; +} // Predicates = [HasStdExtP, IsRV64] + +// Note the spec has a 3-bit f field in bits 30:28 with 0 in bit 27. +// Here we include the 0 in the f field to reduce number of tablegen classes. 
+let Predicates = [HasStdExtP] in { + def PPACK_H : RVPBinary_rr<0b0000, 0b00, 0b100, "ppack.h">; + + def PPACKBT_H : RVPBinary_rr<0b0010, 0b00, 0b100, "ppackbt.h">; + + def PPACKTB_H : RVPBinary_rr<0b0100, 0b00, 0b100, "ppacktb.h">; + + def PPACKT_H : RVPBinary_rr<0b0110, 0b00, 0b100, "ppackt.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def PACKBT_RV32 : RVPBinary_rr<0b0010, 0b01, 0b100, "packbt">; + + def PACKTB_RV32 : RVPBinary_rr<0b0100, 0b01, 0b100, "packtb">; + + def PACKT_RV32 : RVPBinary_rr<0b0110, 0b01, 0b100, "packt">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" +let Predicates = [HasStdExtP, IsRV64] in { + def PPACK_W : RVPBinary_rr<0b0000, 0b01, 0b100, "ppack.w">; + + def PPACKBT_W : RVPBinary_rr<0b0010, 0b01, 0b100, "ppackbt.w">; + def PACKBT_RV64 : RVPBinary_rr<0b0010, 0b11, 0b100, "packbt">; + + def PPACKTB_W : RVPBinary_rr<0b0100, 0b01, 0b100, "ppacktb.w">; + def PACKTB_RV64 : RVPBinary_rr<0b0100, 0b11, 0b100, "packtb">; + + def PPACKT_W : RVPBinary_rr<0b0110, 0b01, 0b100, "ppackt.w">; + def PACKT_RV64 : RVPBinary_rr<0b0110, 0b11, 0b100, "packt">; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP, IsRV32] in { + def PLI_DH : RVPPairLoadImm_i<0b0011000, (ins simm10:$imm10), "pli.dh", + "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24-16} = imm10{8-0}; + let Inst{15} = imm10{9}; + } + + def PLI_DB : RVPPairLoadImm_i<0b0011010, (ins simm8_unsigned:$imm8), "pli.db", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } + + def PLUI_DH : RVPPairLoadImm_i<0b0111000, (ins simm10_unsigned:$imm10), + "plui.dh", "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 33c7138..cebab21 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1703,8 +1703,9 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, /*slidesUp=*/true>; defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp +let ReadsPastVL = 1 in defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, /*slidesUp=*/false>; -let ElementsDependOn = EltDepsVL in +let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>; } // Predicates = [HasVInstructions] @@ -1712,19 +1713,19 @@ let Predicates = [HasVInstructionsAnyF] in { let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp -let ElementsDependOn = EltDepsVL in +let ElementsDependOn = EltDepsVL, ReadsPastVL = 1 in defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>; } // Predicates = [HasVInstructionsAnyF] let Predicates = [HasVInstructions] in { // Vector Register Gather Instruction -let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in { +let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1 in { defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100>; def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">, SchedBinaryMC<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data", "ReadVRGatherEI16VV_index">; -} // Constraints = "@earlyclobber $vd", 
RVVConstraint = Vrgather +} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, ReadsPastVL = 1 // Vector Compress Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress, ElementsDependOn = EltDepsVLMask in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index c75addd9..1fb30a0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -420,7 +420,7 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr> } class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr> - : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm), + : RVInst<(outs VR:$vd), (ins VR:$vs2), opcodestr, "$vd, $vs2", [], InstFormatR> { bits<5> vs2; bits<5> vd; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 0c8487c..889ea98 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -129,20 +129,20 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr> // MIPS extensions //===----------------------------------------------------------------------===// let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in { - def MIPS_PREFETCH : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), - "mips.pref", "$hint, ${imm9}(${rs1})">, - Sched<[]>; + def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), + "mips.pref", "$hint, ${imm9}(${rs1})">, + Sched<[]>; } let Predicates = [HasVendorXMIPSCBOP] in { // Prefetch Data Write. def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 1), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 9)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 9)>; // Prefetch Data Read. 
def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 0), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 8)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 8)>; } let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index ebcf079..3a6ce3c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -58,7 +58,7 @@ class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins, let Predicates = [HasVendorXRivosVizip], DecoderNamespace = "XRivos", Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather, - Inst<6-0> = OPC_CUSTOM_2.Value in { + Inst<6-0> = OPC_CUSTOM_2.Value, ReadsPastVL = 1 in { defm RI_VZIPEVEN_V : VALU_IV_V<"ri.vzipeven", 0b001100>; defm RI_VZIPODD_V : VALU_IV_V<"ri.vzipodd", 0b011100>; defm RI_VZIP2A_V : VALU_IV_V<"ri.vzip2a", 0b000100>; @@ -126,6 +126,7 @@ def RI_VINSERT : CustomRivosVXI<0b010000, OPMVX, (outs VR:$vd_wb), (ins VR:$vd, GPR:$rs1, uimm5:$imm), "ri.vinsert.v.x", "$vd, $rs1, $imm">; +let ReadsPastVL = 1 in def RI_VEXTRACT : CustomRivosXVI<0b010111, OPMVV, (outs GPR:$rd), (ins VR:$vs2, uimm5:$imm), "ri.vextract.x.v", "$rd, $vs2, $imm">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index a47dfe3..b546339 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -74,6 +74,7 @@ class RVInstVCCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, let Uses = [VL, VTYPE]; let RVVConstraint = NoConstraint; let ElementsDependOn = EltDepsVLMask; + let ReadsPastVL = 1; } class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, @@ -98,6 +99,7 @@ class RVInstVCFCustom2<bits<4> funct6_hi4, bits<3> funct3, dag outs, dag ins, let Uses = [VL, VTYPE]; let RVVConstraint = NoConstraint; let ElementsDependOn = EltDepsVLMask; + let ReadsPastVL = 1; } class VCIXInfo<string suffix, VCIXType type, DAGOperand TyRd, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index 66cb2d5..a5ee701 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -65,6 +65,7 @@ class SFInstTileMemOp<dag outs, dag ins, bits<3> nf, RISCVOpcode opcode, let Inst{6-0} = opcode.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in @@ -94,6 +95,7 @@ class SFInstTileMoveOp<bits<6> funct6, dag outs, dag ins, string opcodestr, let Inst{6-0} = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -113,6 +115,7 @@ class SFInstMatmulF<dag outs, dag ins, string opcodestr, string argstr> let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -135,6 +138,7 @@ class SFInstMatmulF8<bit a, bit b, dag outs, dag ins, let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } @@ -167,6 +171,7 @@ class SFInstMatmulI8<bit funct6_1, bit a, bit b, dag outs, dag ins, let Inst{6-0} = OPC_OP_VE.Value; let Uses = [VTYPE, VL]; + let ReadsPastVL = 1; } class I8Encode<bit encoding, string name> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td new file mode 100644 index 0000000..980931e --- 
/dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td @@ -0,0 +1,139 @@ +//===-- RISCVInstrInfoXSpacemiT.td -------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the vendor extensions defined by SpacemiT. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand definitions. +//===----------------------------------------------------------------------===// + +class SMTVDotOpcode<bits<7> val> { + bits<7> Value = val; +} + +class SMTVEncoding2<bits<2> val> { + bits<2> Value = val; +} + +def OPMMA : SMTVDotOpcode<0b1110001>; +def OPMMA_SLIDE : SMTVDotOpcode<0b1110011>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sign Encoding +// Defines the signed/unsigned mixing modes for vector dot-product operations. +// Encoding format: [1:0] bits +// 00: UU (Unsigned x Unsigned) +// 01: US (Unsigned x Signed) +// 10: SU (Signed x Unsigned) +// 11: SS (Signed x Signed) +//===----------------------------------------------------------------------===// +def SMT_VDot_UU : SMTVEncoding2<0b00>; +def SMT_VDot_US : SMTVEncoding2<0b01>; +def SMT_VDot_SU : SMTVEncoding2<0b10>; +def SMT_VDot_SS : SMTVEncoding2<0b11>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sliding Window Modes +// Encoding format: [1:0] bits +// 00: Slide1 (1-element sliding stride) +// 01: Slide2 (2-element sliding stride) +// 10: Slide3 (3-element sliding stride) +// 11: Reserved +// +// Used in sliding-window dot-product operations: +// vd = vs1 • vs2.slide{1|2|3} // • = dot product +//===----------------------------------------------------------------------===// +def SMT_VDot_Slide1 : SMTVEncoding2<0b00>; +def SMT_VDot_Slide2 : SMTVEncoding2<0b01>; +def SMT_VDot_Slide3 : SMTVEncoding2<0b10>; + +//===----------------------------------------------------------------------===// +// Instruction formats +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// Base vector dot product (no slide) format. +class RVInstSMTVDot<SMTVEncoding2 sign, string opcodestr, string argstr> + : RVInst<(outs VRM2:$vd), (ins VR:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA.Value; + let Inst{24-20} = vs2; + let Inst{19-15} = vs1; + let Inst{14} = 0b0; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} + +// Sliding-window vector dot product format. 
+class RVInstSMTVDotSlide<SMTVEncoding2 funct2, SMTVEncoding2 sign, string opcodestr, string argstr> + : RVInst<(outs VRM2:$vd), (ins VRM2:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA_SLIDE.Value; + let Inst{24-20} = vs2; + let Inst{19-16} = vs1{4-1}; + let Inst{15-14} = funct2.Value; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "XSMT" in { + +let Predicates = [HasVendorXSMTVDot], ElementsDependOn = EltDepsVL in { +// Base vector dot product (no slide) instructions +// NOTE: Destination registers (vd) MUST be even-numbered (v0, v2, ..., v30) +// due to hardware alignment constraints. Using odd registers may cause undefined behavior. +def VMADOT : RVInstSMTVDot<SMT_VDot_SS, "smt.vmadot", "$vd, $vs1, $vs2">; +def VMADOTU : RVInstSMTVDot<SMT_VDot_UU, "smt.vmadotu", "$vd, $vs1, $vs2">; +def VMADOTSU : RVInstSMTVDot<SMT_VDot_SU, "smt.vmadotsu", "$vd, $vs1, $vs2">; +def VMADOTUS : RVInstSMTVDot<SMT_VDot_US, "smt.vmadotus", "$vd, $vs1, $vs2">; + +//===----------------------------------------------------------------------===// +// Sliding-window Vector Dot Product Instructions +// +// The numeric suffix (1, 2, 3) specifies the stride of the sliding window: +// 1: Window slides by 1 element per operation +// 2: Window slides by 2 elements per operation +// 3: Window slides by 3 elements per operation +// +// These instructions compute dot products with overlapping operand windows +// where the window position increments by <N> elements between computations. +//===----------------------------------------------------------------------===// +// NOTE: Destination registers (vd) and first source register (vs1) MUST be +// even-numbered (v0, v2, ..., v30) due to hardware alignment constraints. +// Using odd registers may cause undefined behavior. +def VMADOT1 : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SS, "smt.vmadot1", "$vd, $vs1, $vs2">; +def VMADOT1U : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_UU, "smt.vmadot1u", "$vd, $vs1, $vs2">; +def VMADOT1SU : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_SU, "smt.vmadot1su", "$vd, $vs1, $vs2">; +def VMADOT1US : RVInstSMTVDotSlide<SMT_VDot_Slide1, SMT_VDot_US, "smt.vmadot1us", "$vd, $vs1, $vs2">; +def VMADOT2 : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SS, "smt.vmadot2", "$vd, $vs1, $vs2">; +def VMADOT2U : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_UU, "smt.vmadot2u", "$vd, $vs1, $vs2">; +def VMADOT2SU : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_SU, "smt.vmadot2su", "$vd, $vs1, $vs2">; +def VMADOT2US : RVInstSMTVDotSlide<SMT_VDot_Slide2, SMT_VDot_US, "smt.vmadot2us", "$vd, $vs1, $vs2">; +def VMADOT3 : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SS, "smt.vmadot3", "$vd, $vs1, $vs2">; +def VMADOT3U : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_UU, "smt.vmadot3u", "$vd, $vs1, $vs2">; +def VMADOT3SU : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_SU, "smt.vmadot3su", "$vd, $vs1, $vs2">; +def VMADOT3US : RVInstSMTVDotSlide<SMT_VDot_Slide3, SMT_VDot_US, "smt.vmadot3us", "$vd, $vs1, $vs2">; +} +}
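To make the encoding layout above concrete, here is a small self-contained C++ sketch (an editor's illustration, not part of the patch) that assembles the 32-bit word for the base smt.vmadot form. It assumes OPC_CUSTOM_1 is the standard RISC-V custom-1 major opcode (0b0101011) and takes the 2-bit sign mode directly (0b11 corresponds to SMT_VDot_SS).

#include <cassert>
#include <cstdint>

// Field layout mirrors RVInstSMTVDot: OPMMA | vs2 | vs1 | 0 | sign | vd{4-1} | 0 | custom-1.
uint32_t encodeSMTVMADot(unsigned Vd, unsigned Vs1, unsigned Vs2, unsigned Sign) {
  assert(Vd % 2 == 0 && "vd must be an even-numbered vector register");
  uint32_t Inst = 0;
  Inst |= 0b1110001u << 25;        // Inst{31-25}: OPMMA
  Inst |= (Vs2 & 0x1Fu) << 20;     // Inst{24-20}: vs2
  Inst |= (Vs1 & 0x1Fu) << 15;     // Inst{19-15}: vs1
                                   // Inst{14}: fixed 0
  Inst |= (Sign & 0x3u) << 12;     // Inst{13-12}: sign mode (UU/US/SU/SS)
  Inst |= ((Vd >> 1) & 0xFu) << 8; // Inst{11-8}: vd{4-1}; bit 7 stays 0, forcing an even vd
  Inst |= 0b0101011u;              // Inst{6-0}: OPC_CUSTOM_1 (assumed value)
  return Inst;
}

// e.g. encodeSMTVMADot(2, 3, 4, 0b11) would correspond to "smt.vmadot v2, v3, v4".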
\ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 413ad8b..a31afaa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -692,6 +692,21 @@ def : Pat<(binop_allwusers<or> (shl GPR:$op1rs1, (XLenVT 24))), (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + +def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)), + (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), + (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))), + (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + +// Match a pattern of 2 halfwords being inserted into bits [63:32], with +// bits [31:0] coming from a zero extended value. We can use pack with packw for +// bits [63:32]. If bits [31:0] can also be a packw, it can be matched +// separately. +def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)), + (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))), + (zexti32 (i64 GPR:$rs1))), + (PACK (XLenVT GPR:$rs1), + (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 726920e..c7b96f5 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, Intrinsic::riscv_seg8_load_mask}; +static const Intrinsic::ID FixedVlssegIntrIds[] = { + Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask, + Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask, + Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask, + Intrinsic::riscv_sseg8_load_mask}; + static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, @@ -197,9 +203,15 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Indices.size() == Shuffles.size()); + assert(GapMask.getBitWidth() == Factor); + // We only support cases where the skipped fields are the trailing ones. + // TODO: Lower to strided load if there is only a single active field.
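  // Worked example (an editor's illustration, not taken from this patch): for
  // a factor-4 interleaved group where only fields 0 and 1 are used, GapMask
  // is 0b0011, so MaskFactor == 2 and GapMask.isMask() holds. The group is
  // then lowered through FixedVlssegIntrIds[0] (a two-field strided segment
  // load) with a byte stride of Factor * sizeof(element), which simply skips
  // the two trailing fields in memory. A gap pattern such as 0b0101 fails
  // isMask() and is rejected here, leaving the access to the existing
  // shuffle-based lowering.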
+ unsigned MaskFactor = GapMask.popcount(); + if (MaskFactor < 2 || !GapMask.isMask()) + return false; IRBuilder<> Builder(Load); const DataLayout &DL = Load->getDataLayout(); @@ -208,20 +220,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + CallInst *SegLoad = nullptr; + if (MaskFactor < Factor) { + // Lower to strided segmented load. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}, + {Ptr, Stride, Mask, VL}); + } else { + // Lower to normal segmented load. + SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + } for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); + unsigned FactorIdx = Indices[i]; + if (FactorIdx >= MaskFactor) { + // Replace masked-off factors (that are still extracted) with poison. + Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy)); + } else { + Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx); + Shuffles[i]->replaceAllUsesWith(SubVec); + } } return true; diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp index 7a2541a..d234dcf 100644 --- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp +++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp @@ -26,6 +26,7 @@ struct RISCVMoveMerge : public MachineFunctionPass { RISCVMoveMerge() : MachineFunctionPass(ID) {} + const RISCVSubtarget *ST; const RISCVInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -37,15 +38,15 @@ struct RISCVMoveMerge : public MachineFunctionPass { // Merge the two instructions indicated into a single pair instruction. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, unsigned Opcode); + MachineBasicBlock::iterator Paired, bool MoveFromSToA); // Look for C.MV instruction that can be combined with // the given instruction into CM.MVA01S or CM.MVSA01. Return the matching // instruction if one exists. 
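  // (Zcmp/Xqccmp recap, for orientation: "cm.mva01s sa, sb" copies sa into a0
  // and sb into a1, while "cm.mvsa01 sa, sb" copies a0 into sa and a1 into sb;
  // each folds a pair of register moves into a single compressed instruction.)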
MachineBasicBlock::iterator - findMatchingInst(MachineBasicBlock::iterator &MBBI, unsigned InstOpcode, + findMatchingInst(MachineBasicBlock::iterator &MBBI, bool MoveFromSToA, const DestSourcePair &RegPair); - bool mergeMoveSARegPair(const RISCVSubtarget &STI, MachineBasicBlock &MBB); + bool mergeMoveSARegPair(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &Fn) override; StringRef getPassName() const override { return RISCV_MOVE_MERGE_NAME; } @@ -58,41 +59,21 @@ char RISCVMoveMerge::ID = 0; INITIALIZE_PASS(RISCVMoveMerge, "riscv-move-merge", RISCV_MOVE_MERGE_NAME, false, false) -static bool isMoveFromAToS(unsigned Opcode) { - switch (Opcode) { - case RISCV::CM_MVA01S: - case RISCV::QC_CM_MVA01S: - return true; - default: - return false; - } -} - -static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &STI) { - if (STI.hasStdExtZcmp()) +static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &ST) { + if (ST.hasStdExtZcmp()) return RISCV::CM_MVA01S; - if (STI.hasVendorXqccmp()) + if (ST.hasVendorXqccmp()) return RISCV::QC_CM_MVA01S; llvm_unreachable("Unhandled subtarget with paired A to S move."); } -static bool isMoveFromSToA(unsigned Opcode) { - switch (Opcode) { - case RISCV::CM_MVSA01: - case RISCV::QC_CM_MVSA01: - return true; - default: - return false; - } -} - -static unsigned getMoveFromSToAOpcode(const RISCVSubtarget &STI) { - if (STI.hasStdExtZcmp()) +static unsigned getMoveFromAToSOpcode(const RISCVSubtarget &ST) { + if (ST.hasStdExtZcmp()) return RISCV::CM_MVSA01; - if (STI.hasVendorXqccmp()) + if (ST.hasVendorXqccmp()) return RISCV::QC_CM_MVSA01; llvm_unreachable("Unhandled subtarget with paired S to A move"); @@ -123,20 +104,24 @@ bool RISCVMoveMerge::isCandidateToMergeMVSA01(const DestSourcePair &RegPair) { MachineBasicBlock::iterator RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned Opcode) { + bool MoveFromSToA) { const MachineOperand *Sreg1, *Sreg2; MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator NextI = next_nodbg(I, E); DestSourcePair FirstPair = TII->isCopyInstrImpl(*I).value(); DestSourcePair PairedRegs = TII->isCopyInstrImpl(*Paired).value(); - Register ARegInFirstPair = isMoveFromAToS(Opcode) - ? FirstPair.Destination->getReg() - : FirstPair.Source->getReg(); + Register ARegInFirstPair = MoveFromSToA ? FirstPair.Destination->getReg() + : FirstPair.Source->getReg(); if (NextI == Paired) NextI = next_nodbg(NextI, E); DebugLoc DL = I->getDebugLoc(); + // Make a copy so we can update the kill flag in the MoveFromSToA case. The + // copied operand needs to be scoped outside the if since we make a pointer + // to it. + MachineOperand PairedSource = *PairedRegs.Source; + // The order of S-reg depends on which instruction holds A0, instead of // the order of register pair. // e,g. @@ -146,10 +131,20 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, // mv a0, s2 // mv a1, s1 => cm.mva01s s2,s1 bool StartWithX10 = ARegInFirstPair == RISCV::X10; - if (isMoveFromAToS(Opcode)) { - Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source; - Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source; + unsigned Opcode; + if (MoveFromSToA) { + // We are moving one of the copies earlier so its kill flag may become + // invalid. Clear the copied kill flag if there are any reads of the + // register between the new location and the old location. 
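  // Illustrative hazard (assumed RISC-V pseudo-assembly, not from this patch):
  //   mv   a0, s1
  //   use  s2            ; some instruction that reads s2
  //   mv   a1, s2        ; s2 is marked <kill> here
  // Merging to "cm.mva01s s1, s2" moves the read of s2 up to the first mv, so
  // a kill flag copied verbatim from the second mv would wrongly mark s2 dead
  // before the intervening read; the loop below drops the flag in that case.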
+ for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It) + if (It->readsRegister(PairedSource.getReg(), TRI)) + PairedSource.setIsKill(false); + + Opcode = getMoveFromSToAOpcode(*ST); + Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource; + Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source; } else { + Opcode = getMoveFromAToSOpcode(*ST); Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination; Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination; } @@ -163,7 +158,7 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, - unsigned InstOpcode, + bool MoveFromSToA, const DestSourcePair &RegPair) { MachineBasicBlock::iterator E = MBBI->getParent()->end(); @@ -181,7 +176,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, Register SourceReg = SecondPair->Source->getReg(); Register DestReg = SecondPair->Destination->getReg(); - if (isMoveFromAToS(InstOpcode) && isCandidateToMergeMVA01S(*SecondPair)) { + if (MoveFromSToA && isCandidateToMergeMVA01S(*SecondPair)) { // If register pair is valid and destination registers are different. if ((RegPair.Destination->getReg() == DestReg)) return E; @@ -195,8 +190,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, return E; return I; - } else if (isMoveFromSToA(InstOpcode) && - isCandidateToMergeMVSA01(*SecondPair)) { + } else if (!MoveFromSToA && isCandidateToMergeMVSA01(*SecondPair)) { if ((RegPair.Source->getReg() == SourceReg) || (RegPair.Destination->getReg() == DestReg)) return E; @@ -217,8 +211,7 @@ RISCVMoveMerge::findMatchingInst(MachineBasicBlock::iterator &MBBI, // Finds instructions, which could be represented as C.MV instructions and // merged into CM.MVA01S or CM.MVSA01. -bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI, - MachineBasicBlock &MBB) { +bool RISCVMoveMerge::mergeMoveSARegPair(MachineBasicBlock &MBB) { bool Modified = false; for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); @@ -227,22 +220,17 @@ bool RISCVMoveMerge::mergeMoveSARegPair(const RISCVSubtarget &STI, // can, return Dest/Src register pair. auto RegPair = TII->isCopyInstrImpl(*MBBI); if (RegPair.has_value()) { - unsigned Opcode = 0; - - if (isCandidateToMergeMVA01S(*RegPair)) - Opcode = getMoveFromAToSOpcode(STI); - else if (isCandidateToMergeMVSA01(*RegPair)) - Opcode = getMoveFromSToAOpcode(STI); - else { + bool MoveFromSToA = isCandidateToMergeMVA01S(*RegPair); + if (!MoveFromSToA && !isCandidateToMergeMVSA01(*RegPair)) { ++MBBI; continue; } MachineBasicBlock::iterator Paired = - findMatchingInst(MBBI, Opcode, RegPair.value()); + findMatchingInst(MBBI, MoveFromSToA, RegPair.value()); // If matching instruction can be found merge them. 
if (Paired != E) { - MBBI = mergePairedInsns(MBBI, Paired, Opcode); + MBBI = mergePairedInsns(MBBI, Paired, MoveFromSToA); Modified = true; continue; } @@ -256,12 +244,12 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; - const RISCVSubtarget *Subtarget = &Fn.getSubtarget<RISCVSubtarget>(); - if (!(Subtarget->hasStdExtZcmp() || Subtarget->hasVendorXqccmp())) + ST = &Fn.getSubtarget<RISCVSubtarget>(); + if (!ST->hasStdExtZcmp() && !ST->hasVendorXqccmp()) return false; - TII = Subtarget->getInstrInfo(); - TRI = Subtarget->getRegisterInfo(); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); // Resize the modified and used register unit trackers. We do this once // per function and then clear the register units each time we optimize a // move. @@ -269,7 +257,7 @@ bool RISCVMoveMerge::runOnMachineFunction(MachineFunction &Fn) { UsedRegUnits.init(*TRI); bool Modified = false; for (auto &MBB : Fn) - Modified |= mergeMoveSARegPair(*Subtarget, MBB); + Modified |= mergeMoveSARegPair(MBB); return Modified; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 31d2b3a..f89d94f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -673,6 +673,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvfh, FeatureStdExtZvkt, FeatureStdExtZvl256b, + FeatureVendorXSMTVDot, FeatureUnalignedScalarMem]), [TuneDLenFactor2, TuneOptimizedNF2SegmentLoadStore, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 7e58b6f..8a3c8e2 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -589,7 +589,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); - } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + } else if (Opc == RISCV::MIPS_PREF && !isUInt<9>(Val)) { // MIPS Prefetch instructions require the offset to be 9 bits encoded. 
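    // (That is, an offset that does not fit MIPS_PREF's unsigned 9-bit uimm9
    // field is left to the base-address computation and the immediate is
    // reset to 0, mirroring the Zicbop prefetch case above.)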
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td index 5ef858a..8cf15fa 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td +++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td @@ -24,7 +24,7 @@ let SchedModel = Andes45Model in { //===----------------------------------------------------------------------===// // Andes 45 series CPU -// - 2 Interger Arithmetic and Logical Units (ALU) +// - 2 Integer Arithmetic and Logical Units (ALU) // - Multiply / Divide Unit (MDU) // - Load Store Unit (LSU) // - Control and Status Register Unit (CSR) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 5541506..24ebbc3 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -524,16 +524,33 @@ foreach mx = SchedMxListW in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // Latency of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c; + // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites + let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in { + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + } + + defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in { + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 13. 
Vector Floating-Point Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 66ce134..c70571c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -38,7 +38,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include <optional> using namespace llvm; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 67f924a..c707fb1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1431,7 +1431,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::ctlz: case Intrinsic::ctpop: { auto LT = getTypeLegalizationCost(RetTy); - if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) { + if (ST->hasStdExtZvbb() && LT.second.isVector()) { unsigned Op; switch (ICA.getID()) { case Intrinsic::cttz: @@ -1629,6 +1629,7 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // scalarized if the legalized Src and Dst are not equal sized. const DataLayout &DL = this->getDataLayout(); if (!SrcLT.second.isVector() || !DstLT.second.isVector() || + !SrcLT.first.isValid() || !DstLT.first.isValid() || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), SrcLT.second.getSizeInBits()) || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), @@ -2414,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseCost + SlideCost; } +InstructionCost +RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const { + if (isa<FixedVectorType>(Val)) + return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, + Index); + + // TODO: This code replicates what LoopVectorize.cpp used to do when asking + // for the cost of extracting the last lane of a scalable vector. It probably + // needs a more accurate cost. + ElementCount EC = cast<VectorType>(Val)->getElementCount(); + assert(Index < EC.getKnownMinValue() && "Unexpected reverse index"); + return getVectorInstrCost(Opcode, Val, CostKind, + EC.getKnownMinValue() - 1 - Index, nullptr, + nullptr); +} + InstructionCost RISCVTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 05d504c..b632f25 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -114,6 +114,9 @@ public: bool enableScalableVectorization() const override { return ST->hasVInstructions(); } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override { + return ST->hasVInstructions(); + } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { return ST->hasVInstructions() ? 
TailFoldingStyle::DataWithEVL @@ -240,6 +243,11 @@ public: unsigned Index, const Value *Op0, const Value *Op1) const override; + InstructionCost + getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, + TTI::TargetCostKind CostKind, + unsigned Index) const override; + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 37a71e8..f973e75 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -491,8 +491,42 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { // vfirst find-first-set mask bit case RISCV::VCPOP_M: case RISCV::VFIRST_M: + // Vector Bit-manipulation Instructions (Zvbb) + // Vector And-Not + case RISCV::VANDN_VV: + case RISCV::VANDN_VX: + // Vector Reverse Bits in Elements + case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: + // Vector Count Leading Zeros + case RISCV::VCLZ_V: + // Vector Count Trailing Zeros + case RISCV::VCTZ_V: + // Vector Population Count + case RISCV::VCPOP_V: + // Vector Rotate Left + case RISCV::VROL_VV: + case RISCV::VROL_VX: + // Vector Rotate Right + case RISCV::VROR_VI: + case RISCV::VROR_VV: + case RISCV::VROR_VX: + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: return MILog2SEW; + // Vector Widening Shift Left Logical (Zvbb) + case RISCV::VWSLL_VI: + case RISCV::VWSLL_VX: + case RISCV::VWSLL_VV: // Vector Widening Integer Add/Subtract // Def uses EEW=2*SEW . Operands use EEW=SEW. case RISCV::VWADDU_VV: @@ -503,9 +537,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VWADD_VX: case RISCV::VWSUB_VV: case RISCV::VWSUB_VX: - case RISCV::VWSLL_VI: - case RISCV::VWSLL_VX: - case RISCV::VWSLL_VV: // Vector Widening Integer Multiply Instructions // Destination EEW=2*SEW. Source EEW=SEW. 
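  // (Orientation: with SEW=e16, vwsll.vv and vwmul.vv both write EEW=32
  // destinations from EEW=16 sources, which is why the VWSLL_* cases added
  // above sit in this widening group rather than with the SEW-preserving
  // Zvbb/Zvbc cases that simply return MILog2SEW.)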
case RISCV::VWMUL_VV: @@ -1020,12 +1051,40 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VNCLIP_WV: case RISCV::VNCLIP_WX: case RISCV::VNCLIP_WI: - - // Vector Crypto + // Vector Bit-manipulation Instructions (Zvbb) + // Vector And-Not + case RISCV::VANDN_VV: + case RISCV::VANDN_VX: + // Vector Reverse Bits in Elements + case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: + // Vector Count Leading Zeros + case RISCV::VCLZ_V: + // Vector Count Trailing Zeros + case RISCV::VCTZ_V: + // Vector Population Count + case RISCV::VCPOP_V: + // Vector Rotate Left + case RISCV::VROL_VV: + case RISCV::VROL_VX: + // Vector Rotate Right + case RISCV::VROR_VI: + case RISCV::VROR_VV: + case RISCV::VROR_VX: + // Vector Widening Shift Left Logical case RISCV::VWSLL_VI: case RISCV::VWSLL_VX: case RISCV::VWSLL_VV: - + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: // Vector Mask Instructions // Vector Mask-Register Logical Instructions // vmsbf.m set-before-first mask bit @@ -1213,34 +1272,6 @@ static bool isVectorOpUsedAsScalarOp(const MachineOperand &MO) { } } -/// Return true if MI may read elements past VL. -static bool mayReadPastVL(const MachineInstr &MI) { - const RISCVVPseudosTable::PseudoInfo *RVV = - RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); - if (!RVV) - return true; - - switch (RVV->BaseInstr) { - // vslidedown instructions may read elements past VL. They are handled - // according to current tail policy. - case RISCV::VSLIDEDOWN_VI: - case RISCV::VSLIDEDOWN_VX: - case RISCV::VSLIDE1DOWN_VX: - case RISCV::VFSLIDE1DOWN_VF: - - // vrgather instructions may read the source vector at any index < VLMAX, - // regardless of VL. 
- case RISCV::VRGATHER_VI: - case RISCV::VRGATHER_VV: - case RISCV::VRGATHER_VX: - case RISCV::VRGATHEREI16_VV: - return true; - - default: - return false; - } -} - bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { const MCInstrDesc &Desc = MI.getDesc(); if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) @@ -1301,7 +1332,8 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return std::nullopt; } - if (mayReadPastVL(UserMI)) { + if (RISCVII::readsPastVL( + TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); return std::nullopt; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 0ed97f5..d6b6079 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -38,8 +38,15 @@ struct CapabilityEntry { Capability::Capability ReqCapability; }; +struct EnvironmentEntry { + OperandCategory::OperandCategory Category; + uint32_t Value; + Environment::Environment AllowedEnvironment; +}; + using namespace OperandCategory; using namespace Extension; +using namespace Environment; using namespace Capability; using namespace InstructionSet; #define GET_SymbolicOperands_DECL @@ -48,6 +55,8 @@ using namespace InstructionSet; #define GET_ExtensionEntries_IMPL #define GET_CapabilityEntries_DECL #define GET_CapabilityEntries_IMPL +#define GET_EnvironmentEntries_DECL +#define GET_EnvironmentEntries_IMPL #define GET_ExtendedBuiltins_DECL #define GET_ExtendedBuiltins_IMPL #include "SPIRVGenTables.inc" @@ -133,6 +142,23 @@ getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, return Capabilities; } +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value) { + EnvironmentList Environments; + const SPIRV::EnvironmentEntry *Environment = + SPIRV::lookupEnvironmentByCategoryAndValue(Category, Value); + auto TableEnd = ArrayRef(SPIRV::EnvironmentEntries).end(); + while (Environment && Environment->Category == Category && + Environment->Value == Value) { + Environments.push_back(static_cast<SPIRV::Environment::Environment>( + Environment->AllowedEnvironment)); + if (++Environment == TableEnd) + break; + } + + return Environments; +} + CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension) { const SPIRV::ExtensionEntry *Entry = diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index b8c467f..c2c08f8 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -37,6 +37,11 @@ namespace Capability { #include "SPIRVGenTables.inc" } // namespace Capability +namespace Environment { +#define GET_Environment_DECL +#include "SPIRVGenTables.inc" +} // namespace Environment + namespace SourceLanguage { #define GET_SourceLanguage_DECL #include "SPIRVGenTables.inc" @@ -241,6 +246,7 @@ enum InstFlags { using CapabilityList = SmallVector<SPIRV::Capability::Capability, 8>; using ExtensionList = SmallVector<SPIRV::Extension::Extension, 8>; +using EnvironmentList = SmallVector<SPIRV::Environment::Environment, 8>; std::string getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category, @@ -254,6 +260,8 @@ getSymbolicOperandMaxVersion(SPIRV::OperandCategory::OperandCategory Category, 
CapabilityList getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension); ExtensionList diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 4ec31bf..1e3f7fc 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -375,9 +375,17 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) O << '%' << (getIDFromRegister(Op.getReg().id()) + 1); - else if (Op.isImm()) - O << formatImm(Op.getImm()); - else if (Op.isDFPImm()) + else if (Op.isImm()) { + int64_t Imm = Op.getImm(); + // For OpVectorShuffle: + // A Component literal may also be FFFFFFFF, which means the corresponding + // result component has no source and is undefined. + // LLVM representation of poison/undef becomes -1 when lowered to MI. + if (MI->getOpcode() == SPIRV::OpVectorShuffle && Imm == -1) + O << "0xFFFFFFFF"; + else + O << formatImm(Imm); + } else if (Op.isDFPImm()) O << formatImm((double)Op.getDFPImm()); else if (Op.isExpr()) MAI.printExpr(O, *Op.getExpr()); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index d9265f4..5a5860a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -12,7 +12,8 @@ //===----------------------------------------------------------------------===// #include "SPIRVCommandLine.h" -#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "llvm/TargetParser/Triple.h" #include <algorithm> #include <map> @@ -171,3 +172,23 @@ StringRef SPIRVExtensionsParser::checkExtensions( } return StringRef(); } + +std::set<SPIRV::Extension::Extension> +SPIRVExtensionsParser::getValidExtensions(const Triple &TT) { + std::set<SPIRV::Extension::Extension> R; + SPIRV::Environment::Environment CurrentEnvironment = + SPIRV::Environment::Environment::EnvOpenCL; + if (TT.getOS() == Triple::Vulkan) + CurrentEnvironment = SPIRV::Environment::Environment::EnvVulkan; + + for (const auto &[ExtensionName, ExtensionEnum] : SPIRVExtensionMap) { + EnvironmentList AllowedEnv = getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory::ExtensionOperand, + ExtensionEnum); + + if (std::count(AllowedEnv.begin(), AllowedEnv.end(), CurrentEnvironment)) + R.insert(ExtensionEnum); + } + + return R; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h index 3e3b22b..02e847b3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h @@ -21,6 +21,7 @@ namespace llvm { class StringRef; +class Triple; /// Command line parser for toggling SPIR-V extensions. struct SPIRVExtensionsParser @@ -42,6 +43,11 @@ public: static StringRef checkExtensions(const std::vector<std::string> &ExtNames, std::set<SPIRV::Extension::Extension> &AllowedExtensions); + + /// Returns the list of extensions that are valid for a particular + /// target environment (i.e., OpenCL or Vulkan). 
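  /// For example (per the SPIRVCommandLine.cpp implementation earlier in this
  /// patch), a triple whose OS component is Vulkan maps to EnvVulkan and drops
  /// OpenCL-only extensions such as SPV_INTEL_subgroups, while all other
  /// triples currently default to EnvOpenCL.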
+ static std::set<SPIRV::Extension::Extension> + getValidExtensions(const Triple &TT); }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 5259db1..98c7709 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -220,8 +220,10 @@ private: bool selectConst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - bool selectSelect(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, - bool IsSigned) const; + bool selectSelect(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectSelectDefaultArgs(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsSigned) const; bool selectIToF(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsSigned, unsigned Opcode) const; bool selectExt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, @@ -510,7 +512,18 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { if (isTypeFoldingSupported(Def->getOpcode()) && Def->getOpcode() != TargetOpcode::G_CONSTANT && Def->getOpcode() != TargetOpcode::G_FCONSTANT) { - bool Res = selectImpl(I, *CoverageInfo); + bool Res = false; + if (Def->getOpcode() == TargetOpcode::G_SELECT) { + Register SelectDstReg = Def->getOperand(0).getReg(); + Res = selectSelect(SelectDstReg, GR.getSPIRVTypeForVReg(SelectDstReg), + *Def); + GR.invalidateMachineInstr(Def); + Def->removeFromParent(); + MRI->replaceRegWith(DstReg, SelectDstReg); + GR.invalidateMachineInstr(&I); + I.removeFromParent(); + } else + Res = selectImpl(I, *CoverageInfo); LLVM_DEBUG({ if (!Res && Def->getOpcode() != TargetOpcode::G_CONSTANT) { dbgs() << "Unexpected pattern in ASSIGN_TYPE.\nInstruction: "; @@ -2565,8 +2578,52 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes, bool SPIRVInstructionSelector::selectSelect(Register ResVReg, const SPIRVType *ResType, - MachineInstr &I, - bool IsSigned) const { + MachineInstr &I) const { + Register SelectFirstArg = I.getOperand(2).getReg(); + Register SelectSecondArg = I.getOperand(3).getReg(); + assert(ResType == GR.getSPIRVTypeForVReg(SelectFirstArg) && + ResType == GR.getSPIRVTypeForVReg(SelectSecondArg)); + + bool IsFloatTy = + GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypeFloat); + bool IsPtrTy = + GR.isScalarOrVectorOfType(SelectFirstArg, SPIRV::OpTypePointer); + bool IsVectorTy = GR.getSPIRVTypeForVReg(SelectFirstArg)->getOpcode() == + SPIRV::OpTypeVector; + + bool IsScalarBool = + GR.isScalarOfType(I.getOperand(1).getReg(), SPIRV::OpTypeBool); + unsigned Opcode; + if (IsVectorTy) { + if (IsFloatTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectVFSCond : SPIRV::OpSelectVFVCond; + } else if (IsPtrTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectVPSCond : SPIRV::OpSelectVPVCond; + } else { + Opcode = IsScalarBool ? SPIRV::OpSelectVISCond : SPIRV::OpSelectVIVCond; + } + } else { + if (IsFloatTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectSFSCond : SPIRV::OpSelectVFVCond; + } else if (IsPtrTy) { + Opcode = IsScalarBool ? SPIRV::OpSelectSPSCond : SPIRV::OpSelectVPVCond; + } else { + Opcode = IsScalarBool ? 
SPIRV::OpSelectSISCond : SPIRV::OpSelectVIVCond; + } + } + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(1).getReg()) + .addUse(SelectFirstArg) + .addUse(SelectSecondArg) + .constrainAllUses(TII, TRI, RBI); +} + +bool SPIRVInstructionSelector::selectSelectDefaultArgs(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsSigned) const { // To extend a bool, we need to use OpSelect between constants. Register ZeroReg = buildZerosVal(ResType, I); Register OneReg = buildOnesVal(IsSigned, ResType, I); @@ -2598,7 +2655,7 @@ bool SPIRVInstructionSelector::selectIToF(Register ResVReg, TmpType = GR.getOrCreateSPIRVVectorType(TmpType, NumElts, I, TII); } SrcReg = createVirtualRegister(TmpType, &GR, MRI, MRI->getMF()); - selectSelect(SrcReg, TmpType, I, false); + selectSelectDefaultArgs(SrcReg, TmpType, I, false); } return selectOpWithSrcs(ResVReg, ResType, I, {SrcReg}, Opcode); } @@ -2608,7 +2665,7 @@ bool SPIRVInstructionSelector::selectExt(Register ResVReg, MachineInstr &I, bool IsSigned) const { Register SrcReg = I.getOperand(1).getReg(); if (GR.isScalarOrVectorOfType(SrcReg, SPIRV::OpTypeBool)) - return selectSelect(ResVReg, ResType, I, IsSigned); + return selectSelectDefaultArgs(ResVReg, ResType, I, IsSigned); SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg); if (SrcType == ResType) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index 0398e52..aea3397 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -15,7 +15,6 @@ #include "SPIRV.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index b62db7f..1a08c6a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -441,13 +441,10 @@ void insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType, // Tablegen definition assumes SPIRV::ASSIGN_TYPE pseudo-instruction is // present after each auto-folded instruction to take a type reference from. Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg)); - if (auto *RC = MRI.getRegClassOrNull(Reg)) { - MRI.setRegClass(NewReg, RC); - } else { - auto RegClass = GR->getRegClass(SpvType); - MRI.setRegClass(NewReg, RegClass); - MRI.setRegClass(Reg, RegClass); - } + const auto *RegClass = GR->getRegClass(SpvType); + MRI.setRegClass(NewReg, RegClass); + MRI.setRegClass(Reg, RegClass); + GR->assignSPIRVTypeToVReg(SpvType, Reg, MIB.getMF()); // This is to make it convenient for Legalizer to get the SPIRVType // when processing the actual MI (i.e. not pseudo one). 
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 74aec4f..2b34f61 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) { } } -static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, - ArrayRef<unsigned> OpNos) { - Function *F = nullptr; - if (OpNos.empty()) { - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); - } else { - SmallVector<Type *, 4> Tys; - for (unsigned OpNo : OpNos) - Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); - } - II->setCalledFunction(F); +static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) { + IRBuilder<> Builder(II); + auto *Alloca = cast<AllocaInst>(II->getArgOperand(0)); + std::optional<TypeSize> Size = + Alloca->getAllocationSize(Alloca->getDataLayout()); + Value *SizeVal = Builder.getInt64(Size ? *Size : -1); + Builder.CreateIntrinsic(NewID, Alloca->getType(), + {SizeVal, II->getArgOperand(0)}); + II->eraseFromParent(); return true; } @@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_start: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_start); } else { II->eraseFromParent(); Changed = true; @@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_end: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_end); } else { II->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index cdf3c62..690493fb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -166,7 +166,13 @@ void SPIRVSubtarget::initAvailableExtInstSets() { void SPIRVSubtarget::initAvailableExtensions( const std::set<SPIRV::Extension::Extension> &AllowedExtIds) { AvailableExtensions.clear(); - AvailableExtensions.insert_range(AllowedExtIds); + const std::set<SPIRV::Extension::Extension> &ValidExtensions = + SPIRVExtensionsParser::getValidExtensions(TargetTriple); + + for (const auto &Ext : AllowedExtIds) { + if (ValidExtensions.count(Ext)) + AvailableExtensions.insert(Ext); + } accountForAMDShaderTrinaryMinmax(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 614e83a..d2824ee 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -110,22 +110,58 @@ def CapabilityEntries : GenericTable { } //===----------------------------------------------------------------------===// +// Lookup table for matching symbolic operands (category + 32-bit value) to +// SPIR-V environments. If an operand is allows in more than one environment, +// there will be multiple consecutive entries present in the table. 
+//===----------------------------------------------------------------------===// + +// Forward-declare classes used in ExtensionEntry +class Environment; + +class EnvironmentEntry<OperandCategory category, bits<32> value, + Environment allowedEnvironment> { + OperandCategory Category = category; + bits<32> Value = value; + Environment AllowedEnvironment = allowedEnvironment; +} + +def EnvironmentEntries : GenericTable { + let FilterClass = "EnvironmentEntry"; + let Fields = ["Category", "Value", "AllowedEnvironment"]; + string TypeOf_Category = "OperandCategory"; + string TypeOf_AllowedEnvironment = "Environment"; + let PrimaryKey = ["Category", "Value"]; + // Function for looking up a (the first) environment by category + value. Next + // environment should be consecutive. + let PrimaryKeyName = "lookupEnvironmentByCategoryAndValue"; +} + +//===----------------------------------------------------------------------===// // Multiclass used to define a SymbolicOperand and at the same time declare // required extension and capabilities. //===----------------------------------------------------------------------===// -multiclass SymbolicOperandWithRequirements<OperandCategory category, bits<32> value, string mnemonic, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { - assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided for symbolic operand with value " # value; - def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>; +multiclass SymbolicOperandWithRequirements< + OperandCategory category, bits<32> value, string mnemonic, + bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, + list<Capability> reqCapabilities, list<Environment> allowedEnvironments> { + assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided " + "for symbolic operand with value "#value; + def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>; + + assert !le(!size(reqExtensions), 1), + "Too many required extensions for a symbolic/named operand: "#mnemonic; + if !eq(!size(reqExtensions), 1) then { + def : ExtensionEntry<category, value, reqExtensions[0]>; + } - assert !le(!size(reqExtensions), 1), "Too many required extensions for a symbolic/named operand: " # mnemonic; - if !eq(!size(reqExtensions), 1) then { - def : ExtensionEntry<category, value, reqExtensions[0]>; - } + foreach capability = reqCapabilities in { + def : CapabilityEntry<category, value, capability>; + } - foreach capability = reqCapabilities in { - def : CapabilityEntry<category, value, capability>; - } + foreach environment = allowedEnvironments in { + def : EnvironmentEntry<category, value, environment>; + } } //===----------------------------------------------------------------------===// @@ -176,6 +212,20 @@ def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; //===----------------------------------------------------------------------===// +// Definition of the Environments +//===----------------------------------------------------------------------===// + +def Environment : GenericEnum, Operand<i32> { + let FilterClass = "Environment"; + let ValueField = "Value"; +} + +class Environment<bits<32> value> { bits<32> Value = value; } + +def EnvOpenCL : Environment<0>; +def EnvVulkan : Environment<1>; + +//===----------------------------------------------------------------------===// // Multiclass used to define Extesions enum values and 
at the same time // SymbolicOperand entries. //===----------------------------------------------------------------------===// @@ -192,135 +242,146 @@ class Extension<string name, bits<32> value> { bits<32> Value = value; } -multiclass ExtensionOperand<bits<32> value> { +multiclass ExtensionOperand<bits<32> value, + list<Environment> allowedEnvironments> { def NAME : Extension<NAME, value>; - defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0, 0, [], []>; -} - -defm SPV_AMD_shader_explicit_vertex_parameter : ExtensionOperand<1>; -defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2>; -defm SPV_AMD_gcn_shader : ExtensionOperand<3>; -defm SPV_KHR_shader_ballot : ExtensionOperand<4>; -defm SPV_AMD_shader_ballot : ExtensionOperand<5>; -defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6>; -defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7>; -defm SPV_KHR_subgroup_vote : ExtensionOperand<8>; -defm SPV_KHR_16bit_storage : ExtensionOperand<9>; -defm SPV_KHR_device_group : ExtensionOperand<10>; -defm SPV_KHR_multiview : ExtensionOperand<11>; -defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12>; -defm SPV_NV_viewport_array2 : ExtensionOperand<13>; -defm SPV_NV_stereo_view_rendering : ExtensionOperand<14>; -defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15>; -defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16>; -defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17>; -defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18>; -defm SPV_KHR_variable_pointers : ExtensionOperand<19>; -defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20>; -defm SPV_KHR_post_depth_coverage : ExtensionOperand<21>; -defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22>; -defm SPV_EXT_shader_stencil_export : ExtensionOperand<23>; -defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24>; -defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25>; -defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26>; -defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27>; -defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28>; -defm SPV_GOOGLE_decorate_string : ExtensionOperand<29>; -defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30>; -defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31>; -defm SPV_EXT_descriptor_indexing : ExtensionOperand<32>; -defm SPV_KHR_8bit_storage : ExtensionOperand<33>; -defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34>; -defm SPV_NV_ray_tracing : ExtensionOperand<35>; -defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36>; -defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37>; -defm SPV_NV_mesh_shader : ExtensionOperand<38>; -defm SPV_NV_shader_image_footprint : ExtensionOperand<39>; -defm SPV_NV_shading_rate : ExtensionOperand<40>; -defm SPV_INTEL_subgroups : ExtensionOperand<41>; -defm SPV_INTEL_media_block_io : ExtensionOperand<42>; -defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44>; -defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45>; -defm SPV_KHR_float_controls : ExtensionOperand<46>; -defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47>; -defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48>; -defm SPV_NV_cooperative_matrix : ExtensionOperand<49>; -defm SPV_INTEL_shader_integer_functions2 : ExtensionOperand<50>; -defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51>; -defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52>; -defm SPV_NV_shader_sm_builtins : ExtensionOperand<53>; -defm 
SPV_KHR_shader_clock : ExtensionOperand<54>; -defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55>; -defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56>; -defm SPV_INTEL_fpga_reg : ExtensionOperand<57>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58>; -defm SPV_GOOGLE_user_type : ExtensionOperand<59>; -defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60>; -defm SPV_INTEL_kernel_attributes : ExtensionOperand<61>; -defm SPV_KHR_non_semantic_info : ExtensionOperand<62>; -defm SPV_INTEL_io_pipes : ExtensionOperand<63>; -defm SPV_KHR_ray_tracing : ExtensionOperand<64>; -defm SPV_KHR_ray_query : ExtensionOperand<65>; -defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66>; -defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67>; -defm SPV_EXT_shader_atomic_float_add : ExtensionOperand<68>; -defm SPV_KHR_terminate_invocation : ExtensionOperand<69>; -defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70>; -defm SPV_EXT_shader_image_int64 : ExtensionOperand<71>; -defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72>; -defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73>; -defm SPV_INTEL_loop_fuse : ExtensionOperand<74>; -defm SPV_EXT_shader_atomic_float_min_max : ExtensionOperand<75>; -defm SPV_KHR_workgroup_memory_explicit_layout : ExtensionOperand<76>; -defm SPV_KHR_linkonce_odr : ExtensionOperand<77>; -defm SPV_KHR_expect_assume : ExtensionOperand<78>; -defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79>; -defm SPV_NV_bindless_texture : ExtensionOperand<80>; -defm SPV_INTEL_fpga_invocation_pipelining_attributes : ExtensionOperand<81>; -defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82>; -defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83>; -defm SPV_KHR_integer_dot_product : ExtensionOperand<84>; -defm SPV_EXT_shader_atomic_float16_add : ExtensionOperand<85>; -defm SPV_INTEL_runtime_aligned : ExtensionOperand<86>; -defm SPV_KHR_bit_instructions : ExtensionOperand<87>; -defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88>; -defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89>; -defm SPV_KHR_subgroup_rotate : ExtensionOperand<90>; -defm SPV_INTEL_split_barrier : ExtensionOperand<91>; -defm SPV_KHR_ray_cull_mask : ExtensionOperand<92>; -defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93>; -defm SPV_EXT_relaxed_printf_string_address_space : ExtensionOperand<94>; -defm SPV_EXT_ycbcr_attachments : ExtensionOperand<95>; -defm SPV_EXT_mesh_shader : ExtensionOperand<96>; -defm SPV_ARM_core_builtins : ExtensionOperand<97>; -defm SPV_EXT_opacity_micromap : ExtensionOperand<98>; -defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99>; -defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>; -defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>; -defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>; -defm SPV_INTEL_optnone : ExtensionOperand<103>; -defm SPV_INTEL_function_pointers : ExtensionOperand<104>; -defm SPV_INTEL_variable_length_array : ExtensionOperand<105>; -defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>; -defm SPV_INTEL_inline_assembly : ExtensionOperand<107>; -defm SPV_INTEL_cache_controls : ExtensionOperand<108>; -defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; -defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; -defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>; -defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>; -defm SPV_EXT_optnone : ExtensionOperand<113>; -defm SPV_INTEL_joint_matrix : 
ExtensionOperand<114>; -defm SPV_INTEL_float_controls2 : ExtensionOperand<115>; -defm SPV_INTEL_bindless_images : ExtensionOperand<116>; -defm SPV_INTEL_long_composites : ExtensionOperand<117>; -defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>; -defm SPV_INTEL_fp_max_error : ExtensionOperand<119>; -defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>; -defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>; -defm SPV_INTEL_2d_block_io : ExtensionOperand<122>; -defm SPV_INTEL_int4 : ExtensionOperand<123>; -defm SPV_KHR_float_controls2 : ExtensionOperand<124>; -defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>; + defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0, + 0, [], [], allowedEnvironments>; +} + +defm SPV_AMD_shader_explicit_vertex_parameter + : ExtensionOperand<1, [EnvVulkan]>; +defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2, [EnvVulkan]>; +defm SPV_AMD_gcn_shader : ExtensionOperand<3, [EnvVulkan]>; +defm SPV_KHR_shader_ballot : ExtensionOperand<4, [EnvVulkan]>; +defm SPV_AMD_shader_ballot : ExtensionOperand<5, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6, [EnvVulkan]>; +defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7, [EnvVulkan]>; +defm SPV_KHR_subgroup_vote : ExtensionOperand<8, [EnvVulkan]>; +defm SPV_KHR_16bit_storage : ExtensionOperand<9, [EnvVulkan]>; +defm SPV_KHR_device_group : ExtensionOperand<10, [EnvVulkan]>; +defm SPV_KHR_multiview : ExtensionOperand<11, [EnvVulkan]>; +defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12, [EnvVulkan]>; +defm SPV_NV_viewport_array2 : ExtensionOperand<13, [EnvVulkan]>; +defm SPV_NV_stereo_view_rendering : ExtensionOperand<14, [EnvVulkan]>; +defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15, [EnvVulkan]>; +defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16, [EnvVulkan]>; +defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17, [EnvVulkan]>; +defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18, [EnvVulkan]>; +defm SPV_KHR_variable_pointers : ExtensionOperand<19, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20, [EnvVulkan]>; +defm SPV_KHR_post_depth_coverage : ExtensionOperand<21, [EnvVulkan]>; +defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22, []>; +defm SPV_EXT_shader_stencil_export : ExtensionOperand<23, [EnvVulkan]>; +defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24, [EnvVulkan]>; +defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25, [EnvVulkan]>; +defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26, [EnvVulkan]>; +defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28, [EnvVulkan]>; +defm SPV_GOOGLE_decorate_string : ExtensionOperand<29, [EnvVulkan]>; +defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30, [EnvVulkan]>; +defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31, [EnvVulkan]>; +defm SPV_EXT_descriptor_indexing : ExtensionOperand<32, [EnvVulkan]>; +defm SPV_KHR_8bit_storage : ExtensionOperand<33, [EnvVulkan]>; +defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34, [EnvVulkan]>; +defm SPV_NV_ray_tracing : ExtensionOperand<35, [EnvVulkan]>; +defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36, [EnvVulkan]>; +defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37, [EnvVulkan]>; +defm SPV_NV_mesh_shader : ExtensionOperand<38, [EnvVulkan]>; +defm SPV_NV_shader_image_footprint : 
ExtensionOperand<39, [EnvVulkan]>; +defm SPV_NV_shading_rate : ExtensionOperand<40, [EnvVulkan]>; +defm SPV_INTEL_subgroups : ExtensionOperand<41, [EnvOpenCL]>; +defm SPV_INTEL_media_block_io : ExtensionOperand<42, [EnvOpenCL]>; +defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44, [EnvVulkan]>; +defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45, [EnvOpenCL]>; +defm SPV_KHR_float_controls : ExtensionOperand<46, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48, [EnvOpenCL]>; +defm SPV_NV_cooperative_matrix : ExtensionOperand<49, [EnvVulkan]>; +defm SPV_INTEL_shader_integer_functions2 + : ExtensionOperand<50, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51, [EnvOpenCL]>; +defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52, [EnvVulkan]>; +defm SPV_NV_shader_sm_builtins : ExtensionOperand<53, [EnvVulkan]>; +defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; +defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; +defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; +defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; +defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; +defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; +defm SPV_KHR_non_semantic_info : ExtensionOperand<62, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_io_pipes : ExtensionOperand<63, [EnvOpenCL]>; +defm SPV_KHR_ray_tracing : ExtensionOperand<64, [EnvVulkan]>; +defm SPV_KHR_ray_query : ExtensionOperand<65, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66, [EnvOpenCL]>; +defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_add + : ExtensionOperand<68, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_terminate_invocation : ExtensionOperand<69, [EnvVulkan]>; +defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70, [EnvVulkan]>; +defm SPV_EXT_shader_image_int64 : ExtensionOperand<71, [EnvVulkan]>; +defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72, [EnvOpenCL]>; +defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73, [EnvOpenCL]>; +defm SPV_INTEL_loop_fuse : ExtensionOperand<74, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_min_max + : ExtensionOperand<75, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_workgroup_memory_explicit_layout + : ExtensionOperand<76, [EnvVulkan]>; +defm SPV_KHR_linkonce_odr : ExtensionOperand<77, [EnvOpenCL]>; +defm SPV_KHR_expect_assume : ExtensionOperand<78, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79, [EnvOpenCL]>; +defm SPV_NV_bindless_texture : ExtensionOperand<80, [EnvVulkan]>; +defm SPV_INTEL_fpga_invocation_pipelining_attributes + : ExtensionOperand<81, [EnvOpenCL]>; +defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82, [EnvVulkan]>; +defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83, [EnvVulkan]>; +defm SPV_KHR_integer_dot_product : ExtensionOperand<84, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float16_add + : ExtensionOperand<85, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_runtime_aligned : ExtensionOperand<86, [EnvOpenCL]>; +defm SPV_KHR_bit_instructions : ExtensionOperand<87, [EnvOpenCL]>; +defm SPV_NV_ray_tracing_motion_blur : 
ExtensionOperand<88, [EnvVulkan]>; +defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89, [EnvOpenCL]>; +defm SPV_KHR_subgroup_rotate : ExtensionOperand<90, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_split_barrier : ExtensionOperand<91, [EnvOpenCL]>; +defm SPV_KHR_ray_cull_mask : ExtensionOperand<92, [EnvVulkan]>; +defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93, [EnvVulkan]>; +defm SPV_EXT_relaxed_printf_string_address_space + : ExtensionOperand<94, [EnvOpenCL]>; +defm SPV_EXT_mesh_shader : ExtensionOperand<96, [EnvVulkan]>; +defm SPV_ARM_core_builtins : ExtensionOperand<97, [EnvVulkan]>; +defm SPV_EXT_opacity_micromap : ExtensionOperand<98, [EnvVulkan]>; +defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99, [EnvVulkan]>; +defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100, [EnvOpenCL]>; +defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101, [EnvOpenCL]>; +defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102, [EnvOpenCL]>; +defm SPV_INTEL_optnone : ExtensionOperand<103, [EnvOpenCL]>; +defm SPV_INTEL_function_pointers : ExtensionOperand<104, [EnvOpenCL]>; +defm SPV_INTEL_variable_length_array : ExtensionOperand<105, [EnvOpenCL]>; +defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106, [EnvOpenCL]>; +defm SPV_INTEL_inline_assembly : ExtensionOperand<107, [EnvOpenCL]>; +defm SPV_INTEL_cache_controls : ExtensionOperand<108, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_fpga_decorations + : ExtensionOperand<110, [EnvOpenCL]>; +defm SPV_KHR_cooperative_matrix : ExtensionOperand<111, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_arithmetic_fence : ExtensionOperand<112, [EnvOpenCL]>; +defm SPV_EXT_optnone : ExtensionOperand<113, [EnvOpenCL]>; +defm SPV_INTEL_joint_matrix : ExtensionOperand<114, [EnvOpenCL]>; +defm SPV_INTEL_float_controls2 : ExtensionOperand<115, [EnvOpenCL]>; +defm SPV_INTEL_bindless_images : ExtensionOperand<116, [EnvOpenCL]>; +defm SPV_INTEL_long_composites : ExtensionOperand<117, [EnvOpenCL]>; +defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118, [EnvOpenCL]>; +defm SPV_INTEL_fp_max_error : ExtensionOperand<119, [EnvOpenCL]>; +defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120, [EnvOpenCL]>; +defm SPV_INTEL_subgroup_matrix_multiply_accumulate + : ExtensionOperand<121, [EnvOpenCL]>; +defm SPV_INTEL_2d_block_io : ExtensionOperand<122, [EnvOpenCL]>; +defm SPV_INTEL_int4 : ExtensionOperand<123, [EnvOpenCL]>; +defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -342,7 +403,9 @@ class Capability<string name, bits<32> value> { multiclass CapabilityOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : Capability<NAME, value>; - defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Matrix : CapabilityOperand<0, 0, 0, [], []>; @@ -551,7 +614,8 @@ class SourceLanguage<string name, bits<32> value> { multiclass SourceLanguageOperand<bits<32> value> { def : 
SourceLanguage<NAME, value>; - defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, + 0, [], [], []>; } defm Unknown : SourceLanguageOperand<0>; @@ -580,7 +644,8 @@ class AddressingModel<string name, bits<32> value> { multiclass AddressingModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : AddressingModel<NAME, value>; - defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Logical : AddressingModelOperand<0, []>; @@ -607,7 +672,8 @@ class ExecutionModel<string name, bits<32> value> { multiclass ExecutionModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : ExecutionModel<NAME, value>; - defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Vertex : ExecutionModelOperand<0, [Shader]>; @@ -645,7 +711,8 @@ class MemoryModel<string name, bits<32> value> { multiclass MemoryModelOperand<bits<32> value, list<Capability> reqCapabilities> { def : MemoryModel<NAME, value>; - defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Simple : MemoryModelOperand<0, [Shader]>; @@ -672,7 +739,8 @@ class ExecutionMode<string name, bits<32> value> { multiclass ExecutionModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : ExecutionMode<NAME, value>; - defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Invocations : ExecutionModeOperand<0, [Geometry]>; @@ -748,7 +816,8 @@ class StorageClass<string name, bits<32> value> { multiclass StorageClassOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : StorageClass<NAME, value>; - defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, + reqExtensions, reqCapabilities, []>; } defm UniformConstant : StorageClassOperand<0, [], []>; @@ -794,7 +863,8 @@ class Dim<string name, bits<32> value> { multiclass DimOperand<bits<32> value, string mnemonic, list<Capability> reqCapabilities> { def NAME : Dim<NAME, value>; - defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], + reqCapabilities, []>; } defm DIM_1D : DimOperand<0, "1D", [Sampled1D, Image1D]>; @@ -824,7 +894,8 @@ class SamplerAddressingMode<string name, bits<32> value> { multiclass SamplerAddressingModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : SamplerAddressingMode<NAME, value>; - defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, + NAME, 0, 0, [], reqCapabilities, []>; } defm None : SamplerAddressingModeOperand<0, [Kernel]>; @@ -852,7 +923,8 
@@ class SamplerFilterMode<string name, bits<32> value> { multiclass SamplerFilterModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : SamplerFilterMode<NAME, value>; - defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, + 0, 0, [], reqCapabilities, []>; } defm Nearest : SamplerFilterModeOperand<0, [Kernel]>; @@ -877,7 +949,8 @@ class ImageFormat<string name, bits<32> value> { multiclass ImageFormatOperand<bits<32> value, list<Capability> reqCapabilities> { def NAME : ImageFormat<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm Unknown : ImageFormatOperand<0, []>; @@ -940,7 +1013,8 @@ class ImageChannelOrder<string name, bits<32> value> { multiclass ImageChannelOrderOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageChannelOrder<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, + 0, 0, [], reqCapabilities, []>; } defm R : ImageChannelOrderOperand<0, [Kernel]>; @@ -983,7 +1057,8 @@ class ImageChannelDataType<string name, bits<32> value> { multiclass ImageChannelDataTypeOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageChannelDataType<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, + NAME, 0, 0, [], reqCapabilities, []>; } defm SnormInt8 : ImageChannelDataTypeOperand<0, []>; @@ -1023,7 +1098,8 @@ class ImageOperand<string name, bits<32> value> { multiclass ImageOperandOperand<bits<32> value, list<Capability> reqCapabilities> { def : ImageOperand<NAME, value>; - defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm None : ImageOperandOperand<0x0, []>; @@ -1061,7 +1137,8 @@ class FPFastMathMode<string name, bits<32> value> { multiclass FPFastMathModeOperand<bits<32> value, list<Capability> reqCapabilities> { def : FPFastMathMode<NAME, value>; - defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm None : FPFastMathModeOperand<0x0, []>; @@ -1090,7 +1167,8 @@ class FPRoundingMode<string name, bits<32> value> { multiclass FPRoundingModeOperand<bits<32> value> { def NAME : FPRoundingMode<NAME, value>; - defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, + 0, [], [], []>; } defm RTE : FPRoundingModeOperand<0>; @@ -1117,7 +1195,8 @@ class LinkageType<string name, bits<32> value> { multiclass LinkageTypeOperand<bits<32> value, list<Capability> reqCapabilities> { def : LinkageType<NAME, value>; - defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, + 0, [], 
reqCapabilities, []>; } defm Export : LinkageTypeOperand<0, [Linkage]>; @@ -1143,7 +1222,8 @@ class AccessQualifier<string name, bits<32> value> { multiclass AccessQualifierOperand<bits<32> value, list<Capability> reqCapabilities> { def NAME : AccessQualifier<NAME, value>; - defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, + 0, [], reqCapabilities, []>; } defm ReadOnly : AccessQualifierOperand<0, [Kernel]>; @@ -1170,7 +1250,9 @@ class FunctionParameterAttribute<string name, bits<32> value> { multiclass FunctionParameterAttributeOperand<bits<32> value, list<Capability> reqCapabilities> { def : FunctionParameterAttribute<NAME, value>; - defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, value, NAME, 0, 0, [], reqCapabilities>; + defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, + value, NAME, 0, 0, [], + reqCapabilities, []>; } defm Zext : FunctionParameterAttributeOperand<0, [Kernel]>; @@ -1202,7 +1284,9 @@ class Decoration<string name, bits<32> value> { multiclass DecorationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : Decoration<NAME, value>; - defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm RelaxedPrecision : DecorationOperand<0, 0, 0, [], [Shader]>; @@ -1303,7 +1387,9 @@ class BuiltIn<string name, bits<32> value> { multiclass BuiltInOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : BuiltIn<NAME, value>; - defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Position : BuiltInOperand<0, 0, 0, [], [Shader]>; @@ -1417,7 +1503,8 @@ class SelectionControl<string name, bits<32> value> { multiclass SelectionControlOperand<bits<32> value> { def : SelectionControl<NAME, value>; - defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, + 0, 0, [], [], []>; } defm None : SelectionControlOperand<0x0>; @@ -1443,7 +1530,8 @@ class LoopControl<string name, bits<32> value> { multiclass LoopControlOperand<bits<32> value> { def : LoopControl<NAME, value>; - defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, + 0, [], [], []>; } defm None : LoopControlOperand<0x0>; @@ -1476,7 +1564,8 @@ class FunctionControl<string name, bits<32> value> { multiclass FunctionControlOperand<bits<32> value> { def : FunctionControl<NAME, value>; - defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, + 0, [], [], []>; } defm None : FunctionControlOperand<0x0>; @@ -1506,7 +1595,9 @@ class MemorySemantics<string name, bits<32> value> { multiclass MemorySemanticsOperand<bits<32> value, bits<32> 
minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : MemorySemantics<NAME, value>; - defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm None : MemorySemanticsOperand<0x0, 0, 0, [], []>; @@ -1544,7 +1635,9 @@ class MemoryOperand<string name, bits<32> value> { multiclass MemoryOperandOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : MemoryOperand<NAME, value>; - defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm None : MemoryOperandOperand<0x0, 0, 0, [], []>; @@ -1577,7 +1670,9 @@ class Scope<string name, bits<32> value> { multiclass ScopeOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : Scope<NAME, value>; - defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, + maxVersion, reqExtensions, + reqCapabilities, []>; } defm CrossDevice : ScopeOperand<0, 0, 0, [], []>; @@ -1607,7 +1702,9 @@ class GroupOperation<string name, bits<32> value> { multiclass GroupOperationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def NAME : GroupOperation<NAME, value>; - defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm Reduce : GroupOperationOperand<0, 0, 0, [], [Kernel, GroupNonUniformArithmetic, GroupNonUniformBallot]>; @@ -1638,7 +1735,9 @@ class KernelEnqueueFlags<string name, bits<32> value> { multiclass KernelEnqueueFlagsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : KernelEnqueueFlags<NAME, value>; - defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, + minVersion, maxVersion, reqExtensions, + reqCapabilities, []>; } defm NoWait : KernelEnqueueFlagsOperand<0, 0, 0, [], [Kernel]>; @@ -1665,7 +1764,9 @@ class KernelProfilingInfo<string name, bits<32> value> { multiclass KernelProfilingInfoOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : KernelProfilingInfo<NAME, value>; - defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, + NAME, minVersion, maxVersion, + reqExtensions, reqCapabilities, []>; } defm None : KernelProfilingInfoOperand<0x0, 0, 0, [], []>; @@ -1690,7 +1791,8 @@ 
class Opcode<string name, bits<32> value> { multiclass OpcodeOperand<bits<32> value> { def : Opcode<NAME, value>; - defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, 0, [], []>; + defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, + 0, [], [], []>; } // TODO: implement other mnemonics. defm InBoundsAccessChain : OpcodeOperand<66>; @@ -1720,7 +1822,9 @@ class CooperativeMatrixLayout<string name, bits<32> value> { multiclass CooperativeMatrixLayoutOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : CooperativeMatrixLayout<NAME, value>; - defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, + NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } defm RowMajorKHR : CooperativeMatrixLayoutOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1747,7 +1851,9 @@ class CooperativeMatrixOperands<string name, bits<32> value> { multiclass CooperativeMatrixOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : CooperativeMatrixOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, + value, NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } defm NoneKHR : CooperativeMatrixOperandsOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1780,7 +1886,9 @@ class SpecConstantOpOperands<string name, bits<32> value> { multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> { def : SpecConstantOpOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>; + defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, + NAME, 0, 0, reqExtensions, + reqCapabilities, []>; } // Conversion @@ -1868,7 +1976,9 @@ class MatrixMultiplyAccumulateOperands<string name, bits<32> value> { multiclass MatrixMultiplyAccumulateOperandsOperand<bits<32> value, list<Extension> reqExtensions> { def : MatrixMultiplyAccumulateOperands<NAME, value>; - defm : SymbolicOperandWithRequirements<MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, reqExtensions, []>; + defm : SymbolicOperandWithRequirements< + MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, + reqExtensions, [], []>; } defm None : MatrixMultiplyAccumulateOperandsOperand<0x0, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index c0fc3a6..dd22132 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1799,12 +1799,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); - setOperationAction(ISD::FMA , MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f64, + Subtarget->isUA2007() ? 
Legal : Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); - setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f32, + Subtarget->isUA2007() ? Legal : Expand); setOperationAction(ISD::ROTL , MVT::i32, Expand); setOperationAction(ISD::ROTR , MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); @@ -2278,21 +2280,15 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - ArgListEntry Entry; - Entry.Node = Arg; - Entry.Ty = ArgTy; - if (ArgTy->isFP128Ty()) { // Create a stack object and pass the pointer to the library function. int FI = MFI.CreateStackObject(16, Align(8), false); SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(), - Align(8)); - - Entry.Node = FIPtr; - Entry.Ty = PointerType::getUnqual(ArgTy->getContext()); + Chain = DAG.getStore(Chain, DL, Arg, FIPtr, MachinePointerInfo(), Align(8)); + Args.emplace_back(FIPtr, PointerType::getUnqual(ArgTy->getContext())); + } else { + Args.emplace_back(Arg, ArgTy); } - Args.push_back(Entry); return Chain; } @@ -2314,11 +2310,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, if (RetTy->isFP128Ty()) { // Create a Stack Object to receive the return value of type f128. - ArgListEntry Entry; int RetFI = MFI.CreateStackObject(16, Align(8), false); RetPtr = DAG.getFrameIndex(RetFI, PtrVT); - Entry.Node = RetPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + ArgListEntry Entry(RetPtr, PointerType::getUnqual(RetTy->getContext())); if (!Subtarget->is64Bit()) { Entry.IsSRet = true; Entry.IndirectType = RetTy; @@ -3550,6 +3544,11 @@ bool SparcTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return isCheapToSpeculateCtlz(Ty); } +bool SparcTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { + return Subtarget->isUA2007() && !Subtarget->useSoftFloat(); +} + // Override to disable global variable loading on Linux. void SparcTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget->isTargetLinux()) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 0d220f8..7fffb7c 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -28,6 +28,8 @@ namespace llvm { bool useSoftFloat() const override; + bool softPromoteHalfType() const override { return true; } + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. @@ -177,6 +179,11 @@ namespace llvm { bool isCheapToSpeculateCttz(Type *Ty) const override; + bool enableAggressiveFMAFusion(EVT VT) const override { return true; }; + + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { // FIXME: We insert fences for each atomics and generate // sub-optimal code for PSO/TSO. 
(Approximately nobody uses any diff --git a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td index 3a30e55..ffd4423 100644 --- a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td +++ b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td @@ -66,3 +66,15 @@ defm CXBCOND : F2_56<"cxb", 1>; def FPMADDX : FourOp<"fpmaddx", 0b110111, 0b0000, DFPRegs>; def FPMADDXHI : FourOp<"fpmaddxhi", 0b110111, 0b0100, DFPRegs>; } // Predicates = [HasOSA2011] + +// UA2007 instruction patterns. +let Predicates = [HasUA2007] in { +def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, f32:$add)), (FMADDS $rs1, $rs2, $add)>; +def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, f64:$add)), (FMADDD $rs1, $rs2, $add)>; +def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub))), (FMSUBS $rs1, $rs2, $sub)>; +def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub))), (FMSUBD $rs1, $rs2, $sub)>; +def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, f32:$add))), (FNMADDS $rs1, $rs2, $add)>; +def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, f64:$add))), (FNMADDD $rs1, $rs2, $add)>; +def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub)))), (FNMSUBS $rs1, $rs2, $sub)>; +def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub)))), (FNMSUBD $rs1, $rs2, $sub)>; +} // Predicates = [HasUA2007] diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index fbb98ff..f5ffbf5 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -29,50 +29,6 @@ namespace SystemZ { extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs]; } // end namespace SystemZ -class SystemZCCState : public CCState { -private: - /// Records whether the value was widened from a short vector type. - SmallVector<bool, 4> ArgIsShortVector; - - // Check whether ArgVT is a short vector type. - bool IsShortVectorType(EVT ArgVT) { - return ArgVT.isVector() && ArgVT.getStoreSize() <= 8; - } - -public: - SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) - : CCState(CC, isVarArg, MF, locs, C) {} - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - // Record whether the call operand was a short vector. - ArgIsShortVector.clear(); - for (unsigned i = 0; i < Ins.size(); ++i) - ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT)); - - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - // Record whether the call operand was a short vector. - ArgIsShortVector.clear(); - for (unsigned i = 0; i < Outs.size(); ++i) - ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT)); - - CCState::AnalyzeCallOperands(Outs, Fn); - } - - // This version of AnalyzeCallOperands in the base class is not usable - // since we must provide a means of accessing ISD::OutputArg::IsShortVector. - void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, - SmallVectorImpl<ISD::ArgFlagsTy> &Flags, - CCAssignFn Fn) = delete; - - bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } -}; - // Handle i128 argument types. These need to be passed by implicit // reference. 
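A note on the SPARC UA2007 FMA hunks above: making ISD::FMA legal, having isFMAFasterThanFMulAndFAdd return true, and adding the any_fma selection patterns together cover both explicit fma calls and contracted mul/add shapes. The following is a minimal C++ sketch of the source shapes those patterns correspond to (function names are illustrative, not from the patch; fusing the contracted form also requires fp-contract to be enabled):

#include <cmath>

// Explicit FMA: lowers to ISD::FMA, now Legal on UA2007, so it can select
// FMADDS instead of being expanded.
float explicit_fma(float a, float b, float c) { return std::fmaf(a, b, c); }

// Contracted form: the DAG combiner may fuse the multiply and add because
// isFMAFasterThanFMulAndFAdd now returns true (and enableAggressiveFMAFusion
// permits more aggressive fusion).
float contracted(float a, float b, float c) { return a * b + c; }

// Negated shapes matched by the remaining patterns:
float fmsub_shape(float a, float b, float c)  { return std::fmaf(a, b, -c); }  // FMSUBS
float fnmadd_shape(float a, float b, float c) { return -std::fmaf(a, b, c); }  // FNMADDS
float fnmsub_shape(float a, float b, float c) { return -std::fmaf(a, b, -c); } // FNMSUBS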
This could be as simple as the following .td line: // CCIfType<[i128], CCPassIndirect<i64>>, diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 059f31f..2795de5 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -18,7 +18,7 @@ class CCIfSubtarget<string F, CCAction A> // Match if this specific argument was widened from a short vector type. class CCIfShortVector<CCAction A> - : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; + : CCIf<"OrigTy->isVectorTy() && OrigTy->getPrimitiveSizeInBits() <= 64", A>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 5ee66e3..dcefff9 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index fb0a47d..c73dc30 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1941,7 +1941,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); FuncInfo->setSizeOfFnParams(CCInfo.getStackSize()); @@ -2251,7 +2251,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx); + CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx); ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); // We don't support GuaranteedTailCallOpt, only automatically-detected @@ -2460,10 +2460,9 @@ std::pair<SDValue, SDValue> SystemZTargetLowering::makeExternalCall( TargetLowering::ArgListTy Args; Args.reserve(Ops.size()); - TargetLowering::ArgListEntry Entry; for (SDValue Op : Ops) { - Entry.Node = Op; - Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + Op, Op.getValueType().getTypeForEVT(*DAG.getContext())); Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, IsSigned); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 1866962..707887c 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -523,7 +523,7 @@ public: bool MathUsed) const override { // Form add and sub with overflow intrinsics regardless of any extra // users of the math result. 
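The SystemZ shouldFormOverflowOp hunk continued on the next line extends overflow-op formation to i128. As a hedged illustration (assuming __int128 support; not code from the patch), this is the add-then-compare idiom the hook lets the middle end collapse into a single overflow intrinsic, now for 128-bit values as well:

// The carry-out test below is the canonical unsigned-overflow check; with
// i128 accepted by shouldFormOverflowOp it can become
// llvm.uadd.with.overflow.i128 instead of staying as a separate add + compare.
bool add_overflows_u128(unsigned __int128 a, unsigned __int128 b,
                        unsigned __int128 *sum) {
  *sum = a + b;
  return *sum < a;
}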
- return VT == MVT::i32 || VT == MVT::i64; + return VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i128; } bool shouldConsiderGEPOffsetSplit() const override { return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f32c9bd..2611c29 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess( C2.ScaleCost, C2.SetupCost); } -bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Support only equal feature bitsets. Restriction should be relaxed in the - // future to allow inlining when callee's bits are subset of the caller's. - return CallerBits == CalleeBits; -} - unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (!Vector) diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index dc5736e..fc681de 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -65,9 +65,6 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const override; - /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 9b03e85..28495e7 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -151,17 +151,17 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags; M.getModuleFlagsMetadata(ModuleFlags); - MDNode *CFGProfile = nullptr; + MDNode *CGProfile = nullptr; for (const auto &MFE : ModuleFlags) { StringRef Key = MFE.Key->getString(); if (Key == "CG Profile") { - CFGProfile = cast<MDNode>(MFE.Val); + CGProfile = cast<MDNode>(MFE.Val); break; } } - if (!CFGProfile) + if (!CGProfile) return; auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * { @@ -174,7 +174,7 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, return TM->getSymbol(F); }; - for (const auto &Edge : CFGProfile->operands()) { + for (const auto &Edge : CGProfile->operands()) { MDNode *E = cast<MDNode>(Edge); const MCSymbol *From = GetSym(E->getOperand(0)); const MCSymbol *To = GetSym(E->getOperand(1)); diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 9e8f400..2cfdc75 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1649,14 +1649,11 @@ SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, // Prepare arguments TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Size; - Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Entry); + Args.emplace_back(Size, Size.getValueType().getTypeForEVT(*DAG.getContext())); if (NeedsAlign) { - Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT); - Entry.Ty 
= Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Args.push_back(Entry); + SDValue Align = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT); + Args.emplace_back(Align, + Align.getValueType().getTypeForEVT(*DAG.getContext())); } Type *RetTy = Type::getVoidTy(*DAG.getContext()); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 80df4ed..45bbf12 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -220,7 +220,6 @@ static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx, Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(Is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } return Sym; } diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 2a398d4..fa6086c 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index ec95e86..2666342 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -912,6 +912,8 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { if (!IsVoid) updateValueMap(Call, ResultReg); + + diagnoseDontCall(*Call); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9eba4b..35d5c3e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1320,18 +1320,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, // signature They are necessary to match callee and caller signature for // indirect call. 
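One hunk above adds diagnoseDontCall to WebAssemblyFastISel::selectCall, so "dontcall" diagnostics are emitted even when FastISel handles the call. A hedged user-level sketch of what triggers it (this assumes Clang's error/warning function attributes, which are lowered to the dontcall-error / dontcall-warn IR attributes that diagnoseDontCall inspects):

// Calling this function should produce a compile-time diagnostic; the added
// call ensures the check also fires on the FastISel (-O0) path.
__attribute__((error("this helper must not be called on WebAssembly")))
void forbidden_helper();

void caller(bool cond) {
  if (cond)
    forbidden_helper();
}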
if (CallConv == CallingConv::Swift) { + Type *PtrTy = PointerType::getUnqual(*DAG.getContext()); if (!HasSwiftSelfArg) { NumFixedArgs++; - ISD::OutputArg Arg; - Arg.Flags.setSwiftSelf(); + ISD::ArgFlagsTy Flags; + Flags.setSwiftSelf(); + ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0); CLI.Outs.push_back(Arg); SDValue ArgVal = DAG.getUNDEF(PtrVT); CLI.OutVals.push_back(ArgVal); } if (!HasSwiftErrorArg) { NumFixedArgs++; - ISD::OutputArg Arg; - Arg.Flags.setSwiftError(); + ISD::ArgFlagsTy Flags; + Flags.setSwiftError(); + ISD::OutputArg Arg(Flags, PtrVT, EVT(PtrVT), PtrTy, 0, 0); CLI.Outs.push_back(Arg); SDValue ArgVal = DAG.getUNDEF(PtrVT); CLI.OutVals.push_back(ArgVal); @@ -3383,8 +3386,56 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) { return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0)); } +/// Try to convert a i128 comparison to a v16i8 comparison before type +/// legalization splits it up into chunks +static SDValue +combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = X.getValueType(); + + SelectionDAG &DAG = DCI.DAG; + if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat)) + return SDValue(); + + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + // We're looking for an oversized integer equality comparison with SIMD + if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 || + !Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC)) + return SDValue(); + + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD; + }; + + if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) + return SDValue(); + + SDValue VecX = DAG.getBitcast(MVT::v16i8, X); + SDValue VecY = DAG.getBitcast(MVT::v16i8, Y); + SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC); + + SDValue Intr = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(CC == ISD::SETEQ ? 
Intrinsic::wasm_alltrue + : Intrinsic::wasm_anytrue, + DL, MVT::i32), + Cmp}); + + return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), + ISD::SETNE); +} + static SDValue performSETCCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -3392,6 +3443,9 @@ static SDValue performSETCCCombine(SDNode *N, if (!VT.isScalarInteger()) return SDValue(); + if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget)) + return V; + SDValue LHS = N->getOperand(0); if (LHS->getOpcode() != ISD::BITCAST) return SDValue(); @@ -3571,7 +3625,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return performBitcastCombine(N, DCI); case ISD::SETCC: - return performSETCCCombine(N, DCI); + return performSETCCCombine(N, DCI, Subtarget); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); case ISD::SIGN_EXTEND: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 4548a75..45b0e7d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -533,8 +533,8 @@ struct StaticLibcallNameMap { // different libcalls. RTLIB::RuntimeLibcallsInfo RTCI(TT); for (RTLIB::Libcall LC : RTLIB::libcalls()) { - const char *NameLibcall = RTCI.getLibcallName(LC); - if (NameLibcall != nullptr && + StringRef NameLibcall = RTCI.getLibcallName(LC); + if (!NameLibcall.empty() && getRuntimeLibcallSignatures().Table[LC] != unsupported) { assert(!Map.contains(NameLibcall) && "duplicate libcall names in name map"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 52e7065..08fb758 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.AllowOverlappingLoads = true; - // TODO: Teach WebAssembly backend about load v128. + if (ST->hasSIMD128()) + Options.LoadSizes.push_back(16); Options.LoadSizes.append({8, 4, 2, 1}); Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 42d1271..8904867 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -113,7 +113,6 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } // MVP object files can't have symtab entries for tables. 
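The combineVectorSizedSetCCEquality and enableMemCmpExpansion hunks above lean on the same SIMD idiom: compare sixteen bytes in one v128 operation and reduce the result with all_true/any_true rather than splitting the i128 into scalar chunks. A hand-written C++ illustration of that idiom using the wasm_simd128.h intrinsics (an equivalent shape, not the code the backend generates):

#include <cstring>
#include <wasm_simd128.h>

// 16-byte equality as one vector compare plus a lane reduction.
bool equal16(const void *a, const void *b) {
  v128_t va = wasm_v128_load(a);
  v128_t vb = wasm_v128_load(b);
  return wasm_i8x16_all_true(wasm_i8x16_eq(va, vb));
}

// With 16 added to LoadSizes under SIMD128, a fixed-size memcmp like this can
// expand to the same pattern instead of a chain of scalar loads.
bool equal16_memcmp(const void *a, const void *b) {
  return std::memcmp(a, b, 16) == 0;
}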
if (!(Subtarget && Subtarget->hasCallIndirectOverlong())) diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index d9f4405..c0b9339 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -69,7 +69,7 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, Info.Ty, State); StackSize = State.getStackSize(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d..e2a1bbf3 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. + // TODO: We currently just remove all RSP writes from stack operations. This + // is not fully correct because we do not model sync uops which will + // delay subsequent rsp using non-stack instructions. + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push instruction to implicitly use stack pointer register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba8..c5459e4 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + /// Called within X86InstrPostPorcess to remove some rsp read operands + /// on stack instructions to better simulate the stack engine. We currently + /// do not model features of the stack engine like sync uops. 
+ void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 990b381..3d34ea3 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1291,7 +1291,9 @@ def ProcessorFeatures { list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps, TuningPreferMovmskOverVTest, TuningFastImmVectorShift]; - list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); + list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps]; + list<SubtargetFeature> ADLTuning = + !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning); list<SubtargetFeature> ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h index 191e0fa..8e37f34 100644 --- a/llvm/lib/Target/X86/X86CallingConv.h +++ b/llvm/lib/Target/X86/X86CallingConv.h @@ -22,10 +22,10 @@ namespace llvm { bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); + Type *OrigTy, CCState &State); bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State); + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State); } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 067bd43..f007886 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3323,6 +3323,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; SmallVector<MVT, 16> OutVTs; + SmallVector<Type *, 16> ArgTys; SmallVector<Register, 16> ArgRegs; // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra @@ -3369,6 +3370,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ArgRegs.push_back(ResultReg); OutVTs.push_back(VT); + ArgTys.push_back(Val->getType()); } // Analyze operands of the call, assigning locations to each operand. @@ -3379,7 +3381,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsWin64) CCInfo.AllocateStack(32, Align(8)); - CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, ArgTys, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 95ed590..cba7843 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/EHPersonalities.h" @@ -2678,7 +2679,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. 
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); + int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI.getStackSize(); @@ -4212,6 +4213,14 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); + MachineFrameInfo &MFI = MF.getFrameInfo(); + // If the frame is big enough that we might need to scavenge a register to + // handle huge offsets, reserve a stack slot for that now. + if (!isInt<32>(MFI.estimateStackSize(MF))) { + int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false); + RS->addScavengingFrameIndex(FI); + } + // If we are using Windows x64 CFI, ensure that the stack is always 8 byte // aligned. The format doesn't support misaligned stack adjustments. if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f366094..8c3380b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { !Subtarget.hasBWI()) return TypeSplitVector; + // Since v8f16 is legal, widen anything over v4f16. if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() && + VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && @@ -15419,18 +15421,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return SDValue(); } - // Avoid returning the same shuffle operation. For example, - // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, - // undef:v16i16 - if (CrossLaneMask == Mask || InLaneMask == Mask) - return SDValue(); - // Simplify CrossLaneMask based on the actual demanded elements. if (V1.hasOneUse()) for (int i = 0; i != NumElts; ++i) if (!DemandedCrossLane[i]) CrossLaneMask[i] = SM_SentinelUndef; + // Avoid returning the same shuffle operation. For example, + // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, + // undef:v16i16 + if (CrossLaneMask == Mask || InLaneMask == Mask) + return SDValue(); + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); @@ -22219,9 +22221,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { In = DAG.getBitcast(MVT::i16, In); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -22318,9 +22319,8 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { Chain = IsStrict ? 
Op.getOperand(0) : DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = In; - Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext()); + TargetLowering::ArgListEntry Entry( + In, EVT(SVT).getTypeForEVT(*DAG.getContext())); Entry.IsSExt = false; Entry.IsZExt = true; Args.push_back(Entry); @@ -30049,7 +30049,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && @@ -30058,13 +30057,9 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); - Entry.Ty = PointerType::get(*DAG.getContext(), 0); - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0)); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), @@ -33087,13 +33082,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + Args.emplace_back(Arg, ArgTy); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, @@ -45163,6 +45152,9 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return false; + // SSE signbit extraction. + case X86ISD::MOVMSK: + return false; case ISD::INTRINSIC_WO_CHAIN: switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b221..3dd79b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1663,8 +1663,8 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, - unsigned Factor) const override; + ArrayRef<unsigned> Indices, unsigned Factor, + const APInt &GapMask) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072..632db7e 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
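For context on the lowerInterleavedLoad signature change that follows (and the "4x64 bits with Factor = 4" comment above): the new GapMask parameter describes which members of an interleave group are actually loaded, and the X86 path asserts a full group (GapMask.popcount() == Factor). A hedged C++ sketch of a full factor-4 group over 64-bit lanes, the shape this code optimizes on AVX:

#include <cstdint>

struct Quad {
  uint64_t a, b, c, d; // interleaved members, stride 4, 64-bit lanes
};

// When vectorized, each group becomes one wide load plus stride-4
// shufflevectors; all four members are used, so the gap mask is all ones.
void deinterleave(const Quad *in, uint64_t *a, uint64_t *b, uint64_t *c,
                  uint64_t *d, int n) {
  for (int i = 0; i < n; ++i) {
    a[i] = in[i].a;
    b[i] = in[i].b;
    c[i] = in[i].c;
    d[i] = in[i].d;
  }
}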
bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, - ArrayRef<unsigned> Indices, unsigned Factor) const { + ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad( auto *LI = dyn_cast<LoadInst>(Load); if (!LI) return false; - assert(!Mask && "Unexpected mask on a load"); + assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load"); // Create an interleaved access group. IRBuilder<> Builder(LI); diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index cf055cf..090060e 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( NumGadgets += GadgetCount; // Traverse CFG to build the rest of the graph - SmallSet<MachineBasicBlock *, 8> BlocksVisited; + SmallPtrSet<MachineBasicBlock *, 8> BlocksVisited; std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG = [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { unsigned LoopDepth = MLI.getLoopDepth(MBB); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 3b4e531..2a1c499 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -100,7 +100,7 @@ struct BBInfo { class X86PreTileConfig : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; const MachineLoopInfo *MLI = nullptr; - SmallSet<MachineInstr *, 8> DefVisited; + SmallPtrSet<MachineInstr *, 8> DefVisited; DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo; DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 83b11ee..595ad32 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -21,8 +21,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" @@ -907,7 +907,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Determine base register and offset. - int FIOffset; + int64_t FIOffset; Register BasePtr; if (MI.isReturn()) { assert((!hasStackRealignment(MF) || @@ -958,11 +958,41 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } if (MI.getOperand(FIOperandNum+3).isImm()) { - // Offset is a 32-bit integer. 
- int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); - int Offset = FIOffset + Imm; - assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && - "Requesting 64-bit offset in 32-bit immediate!"); + const X86InstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm(); + int64_t Offset = FIOffset + Imm; + bool FitsIn32Bits = isInt<32>(Offset); + // If the offset will not fit in a 32-bit displacement, then for 64-bit + // targets, scavenge a register to hold it. Otherwise... + if (Is64Bit && !FitsIn32Bits) { + assert(RS && "RegisterScavenger was NULL"); + + RS->enterBasicBlockEnd(MBB); + RS->backward(std::next(II)); + + Register ScratchReg = RS->scavengeRegisterBackwards( + X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0, + /*AllowSpill=*/true); + assert(ScratchReg != 0 && "scratch reg was 0"); + RS->setRegUsed(ScratchReg); + + BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset); + + MI.getOperand(FIOperandNum + 3).setImm(0); + MI.getOperand(FIOperandNum + 2).setReg(ScratchReg); + + return false; + } + + // ... for 32-bit targets, this is a bug! + if (!Is64Bit && !FitsIn32Bits) { + MI.emitGenericError("64-bit offset calculated but target is 32-bit"); + // Trap so that the instruction verification pass does not fail if run. + BuildMI(MBB, II, DL, TII->get(X86::TRAP)); + return false; + } + if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 19b409a..2f4c55c 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -180,6 +181,10 @@ public: constrainRegClassToNonRex2(const TargetRegisterClass *RC) const; bool isNonRex2RegClass(const TargetRegisterClass *RC) const; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 8cd52e2..f15a7c7 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -70,6 +70,12 @@ def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, let BufferSize=60; } +// Skylake can retire up to four (potentially fused) uops per cycle. 
Set the +// limit to twice that given we do not model fused uops as only taking up one +// retirement slot. I could not find any documented sources on how many +// in-flight micro-ops can be tracked. +def SKXRCU : RetireControlUnit<0, 8>; + // Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance<ReadAfterLd, 5>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index c92bc97..133c1a4 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -562,14 +562,7 @@ def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { let ReleaseAtCycles = [8,8]; let NumMicroOps = 4; } -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>; - -def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { - let Latency = 8; - let ReleaseAtCycles = [8,8]; - let NumMicroOps = 4; -} -def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>; +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSr(r|m)(_Int)?")>; def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> { let Latency = 9; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 90791fc..62f9527 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -161,19 +161,26 @@ std::optional<unsigned> X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } +enum ClassIDEnum { GPRClass = 0, VectorClass = 1, ScalarFPClass = 2 }; + +unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + return Vector ? VectorClass + : Ty && Ty->isFloatingPointTy() ? ScalarFPClass + : GPRClass; +} + unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); - if (Vector && !ST->hasSSE1()) + if (ClassID == VectorClass && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) { - if (Vector && ST->hasAVX512()) - return 32; - if (!Vector && ST->hasEGPR()) - return 32; - return 16; - } - return 8; + if (!ST->is64Bit()) + return 8; + + if ((ClassID == GPRClass && ST->hasEGPR()) || + (ClassID != GPRClass && ST->hasAVX512())) + return 32; + + return 16; } bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const { @@ -5488,9 +5495,10 @@ InstructionCost X86TTIImpl::getPointersChainCost( return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } -InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -5504,7 +5512,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) { // TODO: AVX2 is the current cut-off because we don't have correct // interleaving costs for prior ISA's. 
if (!BaseT::isStridedAccess(Ptr)) @@ -5513,7 +5521,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, return 1; } - return BaseT::getAddressComputationCost(Ty, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } InstructionCost @@ -6525,8 +6533,8 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, for (const Instruction &I : instructions(Callee)) { if (const auto *CB = dyn_cast<CallBase>(&I)) { - // Having more target features is fine for inline ASM. - if (CB->isInlineAsm()) + // Having more target features is fine for inline ASM and intrinsics. + if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic) continue; SmallVector<Type *, 8> Types; @@ -6542,19 +6550,9 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, if (all_of(Types, IsSimpleTy)) continue; - if (Function *NestedCallee = CB->getCalledFunction()) { - // Assume that intrinsics are always ABI compatible. - if (NestedCallee->isIntrinsic()) - continue; - - // Do a precise compatibility check. - if (!areTypesABICompatible(Caller, NestedCallee, Types)) - return false; - } else { - // We don't know the target features of the callee, - // assume it is incompatible. + // Do a precise compatibility check. + if (!areTypesABICompatible(Caller, Callee, Types)) return false; - } } } return true; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index bc06c47..133b366 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -132,6 +132,7 @@ public: /// @{ unsigned getNumberOfRegisters(unsigned ClassID) const override; + unsigned getRegisterClassForType(bool Vector, Type *Ty) const override; bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override; @@ -194,8 +195,9 @@ public: getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override; diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index e9081a4..ea8b88f 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -190,6 +190,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { State = FunctionState::FinishedEpilog; break; + case X86::LEA64r: case X86::MOV64rr: case X86::ADD64ri32: if (State == FunctionState::InEpilog) { @@ -201,51 +202,56 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { "The epilog is deallocating a stack " "allocation, but the prolog did " "not allocate one"); - if (HasStackDealloc) + if (PoppedRegCount > 0) return rejectCurrentFunctionInternalError( MF, Mode, - "The epilog is deallocating the stack " - "allocation more than once"); - if (PoppedRegCount > 0) - llvm_unreachable( - "Should have raised an error: either popping before " - "deallocating or deallocating without an allocation"); + "The epilog is deallocating a stack allocation after popping " + "registers"); HasStackDealloc = true; 
} else if (State == FunctionState::FinishedEpilog) return rejectCurrentFunctionInternalError( - MF, Mode, "Unexpected mov or add instruction after the epilog"); + MF, Mode, + "Unexpected lea, mov or add instruction after the epilog"); break; case X86::POP64r: if (State == FunctionState::InEpilog) { - // After the stack pointer has been adjusted, the epilog must - // POP each register in reverse order of the PUSHes in the prolog. - PoppedRegCount++; - if (HasStackAlloc != HasStackDealloc) - return rejectCurrentFunctionInternalError( - MF, Mode, - "Cannot pop registers before the stack " - "allocation has been deallocated"); - if (PoppedRegCount > PushedRegs.size()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping more registers than the prolog pushed"); - if (PushedRegs[PushedRegs.size() - PoppedRegCount] != - MI.getOperand(0).getReg()) - return rejectCurrentFunctionInternalError( - MF, Mode, - "The epilog is popping a registers in " - "a different order than the " - "prolog pushed them"); - - // Unwind v2 records the size of the epilog not from where we place - // SEH_BeginEpilogue (as that contains the instruction to adjust the - // stack pointer) but from the first POP instruction (if there is - // one). - if (!UnwindV2StartLocation) { - assert(PoppedRegCount == 1); - UnwindV2StartLocation = &MI; + Register Reg = MI.getOperand(0).getReg(); + if (HasStackAlloc && (PoppedRegCount == 0) && + !llvm::is_contained(PushedRegs, Reg)) { + // If this is a pop that doesn't correspond to the set of pushed + // registers, then assume it was used to adjust the stack pointer. + HasStackDealloc = true; + } else { + // After the stack pointer has been adjusted, the epilog must + // POP each register in reverse order of the PUSHes in the prolog. + PoppedRegCount++; + if (HasStackAlloc != HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot pop registers before the stack " + "allocation has been deallocated"); + if (PoppedRegCount > PushedRegs.size()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping more registers than the prolog " + "pushed"); + if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping a registers in " + "a different order than the " + "prolog pushed them"); + + // Unwind v2 records the size of the epilog not from where we place + // SEH_BeginEpilogue (as that contains the instruction to adjust the + // stack pointer) but from the first POP instruction (if there is + // one). + if (!UnwindV2StartLocation) { + assert(PoppedRegCount == 1); + UnwindV2StartLocation = &MI; + } } } else if (State == FunctionState::FinishedEpilog) // Unexpected instruction after the epilog. diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp index ef4cfcd..0a96ab2 100644 --- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -429,11 +429,7 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // Lower to a call to __misaligned_load(BasePtr). 
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = BasePtr; - Args.push_back(Entry); + Args.emplace_back(BasePtr, IntPtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( @@ -480,14 +476,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lower to a call to __misaligned_store(BasePtr, Value). Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - - Entry.Ty = IntPtrTy; - Entry.Node = BasePtr; - Args.push_back(Entry); - - Entry.Node = Value; - Args.push_back(Entry); + Args.emplace_back(BasePtr, IntPtrTy); + Args.emplace_back(Value, IntPtrTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setCallee( diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 1bd92a2..f61115e 100644 --- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -33,11 +33,10 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy( DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) { const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); - Entry.Node = Dst; Args.push_back(Entry); - Entry.Node = Src; Args.push_back(Entry); - Entry.Node = Size; Args.push_back(Entry); + Type *ArgTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.emplace_back(Dst, ArgTy); + Args.emplace_back(Src, ArgTy); + Args.emplace_back(Size, ArgTy); const char *MemcpyAlign4Name = TLI.getLibcallName(RTLIB::MEMCPY_ALIGN_4); CallingConv::ID CC = TLI.getLibcallCallingConv(RTLIB::MEMCPY_ALIGN_4); diff --git a/llvm/lib/Target/Xtensa/Xtensa.td b/llvm/lib/Target/Xtensa/Xtensa.td index 2c4bacb..4ef885e1 100644 --- a/llvm/lib/Target/Xtensa/Xtensa.td +++ b/llvm/lib/Target/Xtensa/Xtensa.td @@ -23,10 +23,8 @@ include "XtensaFeatures.td" //===----------------------------------------------------------------------===// // Xtensa supported processors. 
//===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; -def : Proc<"generic", []>; +include "XtensaProcessors.td" //===----------------------------------------------------------------------===// // Register File Description diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 6a07bd8..f136703 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -353,7 +353,8 @@ static const MCPhysReg IntRegs[] = {Xtensa::A2, Xtensa::A3, Xtensa::A4, static bool CC_Xtensa_Custom(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State) { if (ArgFlags.isByVal()) { Align ByValAlign = ArgFlags.getNonZeroByValAlign(); unsigned ByValSize = ArgFlags.getByValSize(); diff --git a/llvm/lib/Target/Xtensa/XtensaProcessors.td b/llvm/lib/Target/Xtensa/XtensaProcessors.td new file mode 100644 index 0000000..0faf07d --- /dev/null +++ b/llvm/lib/Target/Xtensa/XtensaProcessors.td @@ -0,0 +1,27 @@ +//===- XtensaProcessors.td - Xtensa Processors -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Xtensa supported processors. +//===----------------------------------------------------------------------===// +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; + +def : Proc<"esp32", [FeatureDensity, FeatureSingleFloat, FeatureLoop, FeatureMAC16, FeatureWindowed, FeatureBoolean, FeatureSEXT, + FeatureNSA, FeatureMul16, FeatureMul32, FeatureMul32High, FeatureDFPAccel, FeatureS32C1I, FeatureTHREADPTR, FeatureDiv32, + FeatureDebug, FeatureException, FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel7, FeatureCoprocessor, + FeatureInterrupt, FeatureDataCache, FeatureRelocatableVector, FeatureTimers3, FeaturePRID, FeatureRegionProtection, FeatureMiscSR, + FeatureMINMAX, FeatureCLAMPS]>; + +def : Proc<"esp8266", [FeatureDensity, FeatureNSA, FeatureMul16, FeatureMul32, FeatureExtendedL32R, FeatureDebug, FeatureException, + FeatureHighPriInterrupts, FeatureHighPriInterruptsLevel3, FeatureInterrupt, FeatureRelocatableVector, FeatureTimers1, + FeatureRegionProtection, FeaturePRID]>; diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt index 8f8b3a5..62e97bf 100644 --- a/llvm/lib/TargetParser/CMakeLists.txt +++ b/llvm/lib/TargetParser/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_component_library(LLVMTargetParser TargetParser.cpp Triple.cpp X86TargetParser.cpp + XtensaTargetParser.cpp ADDITIONAL_HEADER_DIRS Unix diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 22192e1f..2482753 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -759,20 +759,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, StringRef CPU; switch (Family) { - case 3: + case 0x3: CPU = "i386"; break; - case 4: + case 
0x4: CPU = "i486"; break; - case 5: + case 0x5: if (testFeature(X86::FEATURE_MMX)) { CPU = "pentium-mmx"; break; } CPU = "pentium"; break; - case 6: + case 0x6: switch (Model) { case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile // processor, Intel Core 2 Quad processor, Intel Core 2 Quad @@ -1120,7 +1120,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, break; } break; - case 15: { + case 0xf: { if (testFeature(X86::FEATURE_64BIT)) { CPU = "nocona"; break; @@ -1132,7 +1132,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, CPU = "pentium4"; break; } - case 19: + case 0x13: switch (Model) { // Diamond Rapids: case 0x01: diff --git a/llvm/lib/TargetParser/XtensaTargetParser.cpp b/llvm/lib/TargetParser/XtensaTargetParser.cpp new file mode 100644 index 0000000..25725f2 --- /dev/null +++ b/llvm/lib/TargetParser/XtensaTargetParser.cpp @@ -0,0 +1,93 @@ +//==-- XtensaTargetParser - Parser for Xtensa features ------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise Xtensa hardware features +// +//===----------------------------------------------------------------------===// + +#include "llvm/TargetParser/XtensaTargetParser.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" + +namespace llvm { + +namespace Xtensa { +struct CPUInfo { + StringLiteral Name; + CPUKind Kind; + uint64_t Features; +}; + +struct FeatureName { + uint64_t ID; + const char *NameCStr; + size_t NameLength; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +const FeatureName XtensaFeatureNames[] = { +#define XTENSA_FEATURE(ID, NAME) {ID, "+" NAME, sizeof(NAME)}, +#include "llvm/TargetParser/XtensaTargetParser.def" +}; + +constexpr CPUInfo XtensaCPUInfo[] = { +#define XTENSA_CPU(ENUM, NAME, FEATURES) {NAME, CK_##ENUM, FEATURES}, +#include "llvm/TargetParser/XtensaTargetParser.def" +}; + +StringRef getBaseName(StringRef CPU) { + return llvm::StringSwitch<StringRef>(CPU) +#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(ANAME, NAME) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CPU); +} + +StringRef getAliasName(StringRef CPU) { + return llvm::StringSwitch<StringRef>(CPU) +#define XTENSA_CPU_ALIAS(NAME, ANAME) .Case(NAME, ANAME) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CPU); +} + +CPUKind parseCPUKind(StringRef CPU) { + CPU = getBaseName(CPU); + return llvm::StringSwitch<CPUKind>(CPU) +#define XTENSA_CPU(ENUM, NAME, FEATURES) .Case(NAME, CK_##ENUM) +#include "llvm/TargetParser/XtensaTargetParser.def" + .Default(CK_INVALID); +} + +// Get all features for the CPU +void getCPUFeatures(StringRef CPU, std::vector<StringRef> &Features) { + CPU = getBaseName(CPU); + auto I = llvm::find_if(XtensaCPUInfo, + [&](const CPUInfo &CI) { return CI.Name == CPU; }); + assert(I != std::end(XtensaCPUInfo) && "CPU not found!"); + uint64_t Bits = I->Features; + + for (const auto &F : XtensaFeatureNames) { + if ((Bits & F.ID) == F.ID) + Features.push_back(F.getName()); + } +} + +// Find all valid CPUs +void fillValidCPUList(std::vector<StringRef> &Values) { + for (const auto &C : XtensaCPUInfo) { + if (C.Kind != CK_INVALID) { + Values.emplace_back(C.Name); + StringRef Name = getAliasName(C.Name); + 
if (Name != C.Name) + Values.emplace_back(Name); + } + } +} + +} // namespace Xtensa +} // namespace llvm diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 3320508..b775c43 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1821,7 +1821,7 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, // only used outside the region. if (Valid && Lifetimes.size() != 0) { auto *NewLifetime = Lifetimes[0]->clone(); - NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); + NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(0), AI); NewLifetime->insertBefore(DomBB->getTerminator()->getIterator()); // All the outsided lifetime.start markers are no longer necessary. diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ab906f9..180ac9c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2252,6 +2252,10 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, UR.CWorklist.insert(CurrentSCC); for (Function *Clone : Clones) UR.CWorklist.insert(CG.lookupSCC(CG.get(*Clone))); + } else if (Shape.ABI == coro::ABI::Async) { + // Reprocess the function to inline the tail called return function of + // coro.async.end. + UR.CWorklist.insert(&C); } } diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 4e71768..d5d60a3 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -264,11 +264,6 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { } void visitIntrinsicInst(IntrinsicInst &II) { - // When we found the lifetime markers refers to a - // subrange of the original alloca, ignore the lifetime - // markers to avoid misleading the analysis. - if (!IsOffsetKnown || !Offset.isZero()) - return Base::visitIntrinsicInst(II); switch (II.getIntrinsicID()) { default: return Base::visitIntrinsicInst(II); diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index da60f52..042578d 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -226,13 +225,6 @@ public: /*IsVarArgs=*/false); } - static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL, - AllocaInst *Alloced) { - std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL); - uint64_t AsInt = AllocaTypeSize ? 
AllocaTypeSize->getFixedValue() : 0; - return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt); - } - bool expansionApplicableToFunction(Module &M, Function *F) { if (F->isIntrinsic() || !F->isVarArg() || F->hasFnAttribute(Attribute::Naked)) @@ -577,8 +569,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, AllocaInst *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_start"); - Builder.CreateLifetimeStart(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeStart(VaListInstance); Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); @@ -595,8 +586,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); - Builder.CreateLifetimeEnd(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeEnd(VaListInstance); if (Result->getType()->isVoidTy()) Builder.CreateRetVoid(); @@ -746,7 +736,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, // Initialize the fields in the struct Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeStart(Alloced); Frame.initializeStructAlloca(DL, Builder, Alloced); const unsigned NumArgs = FuncType->getNumParams(); @@ -762,7 +752,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument"); Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeStart(VaList); } Builder.SetInsertPoint(CB); Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced)); @@ -802,9 +792,9 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, } if (VaList) - Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeEnd(VaList); - Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeEnd(Alloced); NewCB->setAttributes(PAL); NewCB->takeName(CB); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 8262c8c..44394f6 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -273,7 +273,7 @@ MemoryEffects llvm::computeFunctionBodyMemoryAccess(Function &F, /// Deduce readonly/readnone/writeonly attributes for the SCC. template <typename AARGetterT> static void addMemoryAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { MemoryEffects ME = MemoryEffects::none(); MemoryEffects RecursiveArgME = MemoryEffects::none(); for (Function *F : SCCNodes) { @@ -1002,7 +1002,7 @@ determinePointerAccessAttrs(Argument *A, /// Deduce returned attributes for the SCC. static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining if an argument is always returned. for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -1238,7 +1238,7 @@ static bool inferInitializes(Argument &A, Function &F) { /// Deduce nocapture attributes for the SCC. 
static void addArgumentAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed, + SmallPtrSet<Function *, 8> &Changed, bool SkipInitializes) { ArgumentGraph AG; @@ -1510,7 +1510,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { /// Deduce noalias attributes for the SCC. static void addNoAliasAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noalias // pointers. for (Function *F : SCCNodes) { @@ -1623,7 +1623,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, /// Deduce nonnull attributes for the SCC. static void addNonNullAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Speculative that all functions in the SCC return only nonnull // pointers. We may refute this as we analyze functions. bool SCCReturnsNonNull = true; @@ -1680,7 +1680,7 @@ static void addNonNullAttrs(const SCCNodeSet &SCCNodes, /// Deduce noundef attributes for the SCC. static void addNoUndefAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Check each function in turn, determining which functions return noundef // values. for (Function *F : SCCNodes) { @@ -1788,13 +1788,13 @@ public: InferenceDescriptors.push_back(AttrInference); } - void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed); + void run(const SCCNodeSet &SCCNodes, SmallPtrSet<Function *, 8> &Changed); }; /// Perform all the requested attribute inference actions according to the /// attribute predicates stored before. void AttributeInferer::run(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors; // Go through all the functions in SCC and check corresponding attribute // assumptions for each of them. Attributes that are invalid for this SCC @@ -1969,7 +1969,7 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) { /// /// Returns true if any changes to function attributes were made. static void inferConvergent(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { AttributeInferer AI; // Request to remove the convergent attribute from all functions in the SCC @@ -2000,7 +2000,7 @@ static void inferConvergent(const SCCNodeSet &SCCNodes, /// /// Returns true if any changes to function attributes were made. static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { AttributeInferer AI; if (!DisableNoUnwindInference) @@ -2069,7 +2069,7 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, } static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { // Try and identify functions that do not recurse. // If the SCC contains multiple nodes we know for sure there is recursion. @@ -2105,7 +2105,7 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, // Set the noreturn function attribute if possible. 
static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->doesNotReturn()) @@ -2166,7 +2166,7 @@ static bool allPathsGoThroughCold(Function &F) { // Set the cold function attribute if possible. static void addColdAttrs(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot)) @@ -2213,7 +2213,7 @@ static bool functionWillReturn(const Function &F) { // Set the willreturn function attribute if possible. static void addWillReturn(const SCCNodeSet &SCCNodes, - SmallSet<Function *, 8> &Changed) { + SmallPtrSet<Function *, 8> &Changed) { for (Function *F : SCCNodes) { if (!F || F->willReturn() || !functionWillReturn(*F)) continue; @@ -2239,7 +2239,7 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { } template <typename AARGetterT> -static SmallSet<Function *, 8> +static SmallPtrSet<Function *, 8> deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, bool ArgAttrsOnly) { SCCNodesResult Nodes = createSCCNodeSet(Functions); @@ -2248,7 +2248,7 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, if (Nodes.SCCNodes.empty()) return {}; - SmallSet<Function *, 8> Changed; + SmallPtrSet<Function *, 8> Changed; if (ArgAttrsOnly) { // ArgAttrsOnly means to only infer attributes that may aid optimizations // on the *current* function. "initializes" attribute is to aid diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 45fa9d5..9196a01 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -16,7 +16,6 @@ #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SCCPSolver.h" @@ -400,12 +399,6 @@ Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) { Constant *InstCostVisitor::visitCallBase(CallBase &I) { assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - // Look through calls to ssa_copy intrinsics. - if (auto *II = dyn_cast<IntrinsicInst>(&I); - II && II->getIntrinsicID() == Intrinsic::ssa_copy) { - return LastVisited->second; - } - Function *F = I.getCalledFunction(); if (!F || !canConstantFoldCallTo(&I, F)) return nullptr; @@ -611,17 +604,15 @@ void FunctionSpecializer::promoteConstantStackValues(Function *F) { } } -// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics -// interfere with the promoteConstantStackValues() optimization. +// The SCCP solver inserts bitcasts for PredicateInfo. These interfere with the +// promoteConstantStackValues() optimization. 
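As an illustrative sketch of the no-op bitcasts this comment refers to (hypothetical IR, names invented, not taken from the patch or its tests): the solver's PredicateInfo copies take the form
  %x.copy = bitcast i32 %x to i32    ; identity copy inserted for PredicateInfo
  %cmp    = icmp sgt i32 %x.copy, 0
and removeSSACopy below replaces every use of %x.copy with %x and erases the bitcast, so the stack-value promotion that follows sees the underlying value directly.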
static void removeSSACopy(Function &F) { for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { - auto *II = dyn_cast<IntrinsicInst>(&Inst); - if (!II) - continue; - if (II->getIntrinsicID() != Intrinsic::ssa_copy) + auto *BC = dyn_cast<BitCastInst>(&Inst); + if (!BC || BC->getType() != BC->getOperand(0)->getType()) continue; - Inst.replaceAllUsesWith(II->getOperand(0)); + Inst.replaceAllUsesWith(BC->getOperand(0)); Inst.eraseFromParent(); } } diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 45fb1f5..c576fbc 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" @@ -30,6 +32,35 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" +namespace { +class GlobalDCELegacyPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + GlobalDCELegacyPass() : ModulePass(ID) { + initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // Note: GlobalDCEPass does not use any analyses, so we're safe to call the + // new-pm style pass with a default-initialized analysis manager here + ModuleAnalysisManager MAM; + auto PA = Impl.run(M, MAM); + return !PA.areAllPreserved(); + } + +private: + GlobalDCEPass Impl; +}; +} // namespace + +char GlobalDCELegacyPass::ID = 0; +INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce", "Dead Global Elimination", + false, false) + +// Public interface to the GlobalDCEPass. +ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCELegacyPass(); } + static cl::opt<bool> ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::desc("Enable virtual function elimination")); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index bdda498..d7edd12 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1133,9 +1133,6 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, const DataLayout &DL, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { - // Ignore no-op GEPs and bitcasts. - StoredOnceVal = StoredOnceVal->stripPointerCasts(); - // If we are dealing with a pointer global that is initialized to null and // only has one (non-null) value stored into it, then we can optimize any // users of the loaded value (often calls and loads) that would trap if the diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 6554377..88f5ca0 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -459,6 +459,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, }), Calls.end()); + // Report inlining decision BEFORE deleting function contents, so we + // can still access e.g. the DebugLoc + Advice->recordInliningWithCalleeDeleted(); // Clear the body and queue the function itself for call graph // updating when we finish inlining. 
makeFunctionBodyUnreachable(Callee); @@ -470,9 +473,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, DeadFunctionsInComdats.push_back(&Callee); } } - if (CalleeWasDeleted) - Advice->recordInliningWithCalleeDeleted(); - else + if (!CalleeWasDeleted) Advice->recordInlining(); } diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp index 844e275..1185e63 100644 --- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp +++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -284,6 +284,10 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { return Call.first->getCaller() == &Callee; }); + + // Report inlining decision BEFORE deleting function contents, so we + // can still access e.g. the DebugLoc + Advice->recordInliningWithCalleeDeleted(); // Clear the body and queue the function itself for deletion when we // finish inlining. // Note that after this point, it is an error to do anything other @@ -295,9 +299,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, CalleeWasDeleted = true; } } - if (CalleeWasDeleted) - Advice->recordInliningWithCalleeDeleted(); - else + if (!CalleeWasDeleted) Advice->recordInlining(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index d7971e8..6e46898 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3740,6 +3740,82 @@ static Instruction *foldIntegerPackFromVector(Instruction &I, return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); } +/// Match \p V as "lshr -> mask -> zext -> shl". +/// +/// \p Int is the underlying integer being extracted from. +/// \p Mask is a bitmask identifying which bits of the integer are being +/// extracted. \p Offset identifies which bit of the result \p V corresponds to +/// the least significant bit of \p Int +static bool matchZExtedSubInteger(Value *V, Value *&Int, APInt &Mask, + uint64_t &Offset, bool &IsShlNUW, + bool &IsShlNSW) { + Value *ShlOp0; + uint64_t ShlAmt = 0; + if (!match(V, m_OneUse(m_Shl(m_Value(ShlOp0), m_ConstantInt(ShlAmt))))) + return false; + + IsShlNUW = cast<BinaryOperator>(V)->hasNoUnsignedWrap(); + IsShlNSW = cast<BinaryOperator>(V)->hasNoSignedWrap(); + + Value *ZExtOp0; + if (!match(ShlOp0, m_OneUse(m_ZExt(m_Value(ZExtOp0))))) + return false; + + Value *MaskedOp0; + const APInt *ShiftedMaskConst = nullptr; + if (!match(ZExtOp0, m_CombineOr(m_OneUse(m_And(m_Value(MaskedOp0), + m_APInt(ShiftedMaskConst))), + m_Value(MaskedOp0)))) + return false; + + uint64_t LShrAmt = 0; + if (!match(MaskedOp0, + m_CombineOr(m_OneUse(m_LShr(m_Value(Int), m_ConstantInt(LShrAmt))), + m_Value(Int)))) + return false; + + if (LShrAmt > ShlAmt) + return false; + Offset = ShlAmt - LShrAmt; + + Mask = ShiftedMaskConst ? ShiftedMaskConst->shl(LShrAmt) + : APInt::getBitsSetFrom( + Int->getType()->getScalarSizeInBits(), LShrAmt); + + return true; +} + +/// Try to fold the join of two scalar integers whose bits are unpacked and +/// zexted from the same source integer. 
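An illustrative sketch of the fold described above (hypothetical IR; the masks, shift amounts and names are invented, not taken from the patch or its tests). Two slices extracted from the same source with equal offsets,
  %lo      = lshr i32 %x, 8
  %lo.mask = and i32 %lo, 255
  %lo.zext = zext i32 %lo.mask to i64
  %lo.shl  = shl nuw nsw i64 %lo.zext, 16    ; offset = 16 - 8 = 8
  %hi      = lshr i32 %x, 16
  %hi.mask = and i32 %hi, 255
  %hi.zext = zext i32 %hi.mask to i64
  %hi.shl  = shl nuw nsw i64 %hi.zext, 24    ; offset = 24 - 16 = 8
  %r       = or disjoint i64 %lo.shl, %hi.shl
can be repacked by the function that follows into a single mask, zext and shift of the source:
  %x.mask  = and i32 %x, 16776960            ; 0x00FFFF00: the two masks shifted into place and merged
  %x.zext  = zext i32 %x.mask to i64
  %r       = shl nuw nsw i64 %x.zext, 8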
+static Value *foldIntegerRepackThroughZExt(Value *Lhs, Value *Rhs, + InstCombiner::BuilderTy &Builder) { + + Value *LhsInt, *RhsInt; + APInt LhsMask, RhsMask; + uint64_t LhsOffset, RhsOffset; + bool IsLhsShlNUW, IsLhsShlNSW, IsRhsShlNUW, IsRhsShlNSW; + if (!matchZExtedSubInteger(Lhs, LhsInt, LhsMask, LhsOffset, IsLhsShlNUW, + IsLhsShlNSW)) + return nullptr; + if (!matchZExtedSubInteger(Rhs, RhsInt, RhsMask, RhsOffset, IsRhsShlNUW, + IsRhsShlNSW)) + return nullptr; + if (LhsInt != RhsInt || LhsOffset != RhsOffset) + return nullptr; + + APInt Mask = LhsMask | RhsMask; + + Type *DestTy = Lhs->getType(); + Value *Res = Builder.CreateShl( + Builder.CreateZExt( + Builder.CreateAnd(LhsInt, Mask, LhsInt->getName() + ".mask"), DestTy, + LhsInt->getName() + ".zext"), + ConstantInt::get(DestTy, LhsOffset), "", IsLhsShlNUW && IsRhsShlNUW, + IsLhsShlNSW && IsRhsShlNSW); + Res->takeName(Lhs); + return Res; +} + // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools // track these properities for preservation. Note that we can decompose // equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask * @@ -3841,6 +3917,8 @@ static Value *foldBitmaskMul(Value *Op0, Value *Op1, Value *InstCombinerImpl::foldDisjointOr(Value *LHS, Value *RHS) { if (Value *Res = foldBitmaskMul(LHS, RHS, Builder)) return Res; + if (Value *Res = foldIntegerRepackThroughZExt(LHS, RHS, Builder)) + return Res; return nullptr; } @@ -3973,7 +4051,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { /*NSW=*/true, /*NUW=*/true)) return R; - if (Value *Res = foldBitmaskMul(I.getOperand(0), I.getOperand(1), Builder)) + if (Value *Res = foldDisjointOr(I.getOperand(0), I.getOperand(1))) return replaceInstUsesWith(I, Res); if (Value *Res = reassociateDisjointOr(I.getOperand(0), I.getOperand(1))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 47e017e..2433534 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -267,12 +267,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { MI->getContext(), APInt::getSplat(Len * 8, FillC->getValue())); StoreInst *S = Builder.CreateStore(FillVal, Dest, MI->isVolatile()); S->copyMetadata(*MI, LLVMContext::MD_DIAssignID); - auto replaceOpForAssignmentMarkers = [FillC, FillVal](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(S)) { if (llvm::is_contained(DbgAssign->location_ops(), FillC)) DbgAssign->replaceVariableLocationOp(FillC, FillVal); - }; - for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers); - for_each(at::getDVRAssignmentMarkers(S), replaceOpForAssignmentMarkers); + } S->setAlignment(Alignment); if (MI->isAtomic()) @@ -1532,6 +1530,51 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V, return nullptr; } +/// Helper to match idempotent binary intrinsics, namely, intrinsics where +/// `f(f(x, y), y) == f(x, y)` holds. 
+static bool isIdempotentBinaryIntrinsic(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::maximum: + case Intrinsic::minimum: + case Intrinsic::maximumnum: + case Intrinsic::minimumnum: + case Intrinsic::maxnum: + case Intrinsic::minnum: + return true; + default: + return false; + } +} + +/// Attempt to simplify value-accumulating recurrences of kind: +/// %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ] +/// %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b) +/// And let the idempotent binary intrinsic be hoisted, when the operands are +/// known to be loop-invariant. +static Value *foldIdempotentBinaryIntrinsicRecurrence(InstCombinerImpl &IC, + IntrinsicInst *II) { + PHINode *PN; + Value *Init, *OtherOp; + + // A binary intrinsic recurrence with loop-invariant operands is equivalent to + // `call @llvm.binary.intrinsic(Init, OtherOp)`. + auto IID = II->getIntrinsicID(); + if (!isIdempotentBinaryIntrinsic(IID) || + !matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) || + !IC.getDominatorTree().dominates(OtherOp, PN)) + return nullptr; + + auto *InvariantBinaryInst = + IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp); + if (isa<FPMathOperator>(InvariantBinaryInst)) + cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II); + return InvariantBinaryInst; +} + static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) { if (!CanReorderLanes) return nullptr; @@ -3912,6 +3955,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Value *Reverse = foldReversedIntrinsicOperands(II)) return replaceInstUsesWith(*II, Reverse); + if (Value *Res = foldIdempotentBinaryIntrinsicRecurrence(*this, II)) + return replaceInstUsesWith(*II, Res); + // Some intrinsics (like experimental_gc_statepoint) can be used in invoke // context, so it is handled in visitCallBase and we should trigger it. return visitCallBase(*II); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index a43a6ee..801ac00 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1131,11 +1131,10 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Shl: { // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. - const APInt *Amt; - if (match(I->getOperand(1), m_APInt(Amt))) { + uint64_t ShiftAmt; + if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) { if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; - uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; return true; } @@ -1144,11 +1143,11 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::LShr: { // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. 
- const APInt *Amt; - if (match(I->getOperand(1), m_APInt(Amt))) { + uint64_t ShiftAmt; + if (match(I->getOperand(1), m_ConstantInt(ShiftAmt))) { if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; - BitsToClear += Amt->getZExtValue(); + BitsToClear += ShiftAmt; if (BitsToClear > V->getType()->getScalarSizeInBits()) BitsToClear = V->getType()->getScalarSizeInBits(); return true; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index cf94d28..2386e7a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1320,6 +1320,35 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { return nullptr; } +/// Fold icmp eq (num + mask) & ~mask, num +/// to +/// icmp eq (and num, mask), 0 +/// Where mask is a low bit mask. +Instruction *InstCombinerImpl::foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp) { + Value *Num; + CmpPredicate Pred; + const APInt *Mask, *Neg; + + if (!match(&Cmp, + m_c_ICmp(Pred, m_Value(Num), + m_OneUse(m_c_And(m_OneUse(m_c_Add(m_Deferred(Num), + m_LowBitMask(Mask))), + m_APInt(Neg)))))) + return nullptr; + + if (*Neg != ~*Mask) + return nullptr; + + if (!ICmpInst::isEquality(Pred)) + return nullptr; + + // Create new icmp eq (num & mask), 0 + auto *NewAnd = Builder.CreateAnd(Num, *Mask); + auto *Zero = Constant::getNullValue(Num->getType()); + + return new ICmpInst(Pred, NewAnd, Zero); +} + /// Fold icmp Pred X, C. /// TODO: This code structure does not make sense. The saturating add fold /// should be moved to some other helper and extended as noted below (it is also @@ -1521,11 +1550,11 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, // trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] < 0 --> ShOp < 0 // trunc iN (ShOp >> ShAmtC) to i[N - ShAmtC] > -1 --> ShOp > -1 Value *ShOp; - const APInt *ShAmtC; + uint64_t ShAmt; bool TrueIfSigned; if (isSignBitCheck(Pred, C, TrueIfSigned) && - match(X, m_Shr(m_Value(ShOp), m_APInt(ShAmtC))) && - DstBits == SrcBits - ShAmtC->getZExtValue()) { + match(X, m_Shr(m_Value(ShOp), m_ConstantInt(ShAmt))) && + DstBits == SrcBits - ShAmt) { return TrueIfSigned ? new ICmpInst(ICmpInst::ICMP_SLT, ShOp, ConstantInt::getNullValue(SrcTy)) : new ICmpInst(ICmpInst::ICMP_SGT, ShOp, @@ -7644,6 +7673,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; + if (Instruction *Res = foldIsMultipleOfAPowerOfTwo(I)) + return Res; + // Test if the ICmpInst instruction is used exclusively by a select as // part of a minimum or maximum operation. If so, refrain from doing // any other folding. 
This helps out other analyses which understand diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index c67e27e..2340028 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -721,6 +721,7 @@ public: Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); + Instruction *foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp); Instruction *foldICmpUsingBoolRange(ICmpInst &I); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0be1034..4b10586 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -737,6 +737,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U), ".unpack"); NewLoad->setAAMetadata(LI.getAAMetadata()); + // Copy invariant metadata from parent load. + NewLoad->copyMetadata(LI, LLVMContext::MD_invariant_load); return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue( PoisonValue::get(T), NewLoad, 0, Name)); } @@ -764,6 +766,8 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { Name + ".unpack"); // Propagate AA metadata. It'll still be valid on the narrowed load. L->setAAMetadata(LI.getAAMetadata()); + // Copy invariant metadata from parent load. + L->copyMetadata(LI, LLVMContext::MD_invariant_load); V = IC.Builder.CreateInsertValue(V, L, i); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index fe0f308..b17cf17 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *V = LHS; unsigned MaskElems = Mask.size(); auto *SrcTy = cast<FixedVectorType>(V->getType()); - unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy); unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); assert(SrcElemBitWidth && "vector elements must have a bitwidth"); unsigned SrcNumElems = SrcTy->getNumElements(); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8da65c5..50258af 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1211,23 +1211,19 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { return; if (!II.isLifetimeStartOrEnd()) return; - // Found lifetime intrinsic, add ASan instrumentation if necessary. - auto *Size = cast<ConstantInt>(II.getArgOperand(0)); - // If size argument is undefined, don't do anything. - if (Size->isMinusOne()) return; - // Check that size doesn't saturate uint64_t and can - // be stored in IntptrTy. 
- const uint64_t SizeValue = Size->getValue().getLimitedValue(); - if (SizeValue == ~0ULL || - !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) - return; // Find alloca instruction that corresponds to llvm.lifetime argument. - AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(0)); // We're interested only in allocas we can handle. if (!AI || !ASan.isInterestingAlloca(*AI)) return; + + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + // Check that size is known and can be stored in IntptrTy. + if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size)) + return; + bool DoPoison = (ID == Intrinsic::lifetime_end); - AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; + AllocaPoisonCall APC = {&II, AI, *Size, DoPoison}; if (AI->isStaticAlloca()) StaticAllocaPoisonCallVec.push_back(APC); else if (ClInstrumentDynamicAllocas) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index bcb90d6..fc34d14 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1469,22 +1469,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, size_t Size = memtag::getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - auto HandleLifetime = [&](IntrinsicInst *II) { - // Set the lifetime intrinsic to cover the whole alloca. This reduces the - // set of assumptions we need to make about the lifetime. Without this we - // would need to ensure that we can track the lifetime pointer to a - // constant offset from the alloca, and would still need to change the - // size to include the extra alignment we use for the untagging to make - // the size consistent. - // - // The check for standard lifetime below makes sure that we have exactly - // one set of start / end in any execution (i.e. the ends are not - // reachable from each other), so this will not cause any problems. - II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); - }; - llvm::for_each(Info.LifetimeStart, HandleLifetime); - llvm::for_each(Info.LifetimeEnd, HandleLifetime); - AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) { auto *User = U.getUser(); return User != AILong && !isa<LifetimeIntrinsic>(User); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7d3c940..948e2c6 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2690,6 +2690,54 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { SC.Done(&I); } + // Perform a bitwise OR on the horizontal pairs (or other specified grouping) + // of elements. + // + // For example, suppose we have: + // VectorA: <a1, a2, a3, a4, a5, a6> + // VectorB: <b1, b2, b3, b4, b5, b6> + // ReductionFactor: 3. + // The output would be: + // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6> + // + // This is convenient for instrumenting horizontal add/sub. + // For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic(). 
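Before the horizontalReduce definition that follows, here is a small scalar model of the shuffle-mask construction the comment above describes, using its <a1..a6>/<b1..b6>, ReductionFactor = 3 example. Plain C++ with an int vector standing in for the concatenated shadow vectors; ConcatAB and the element values are invented for the illustration:

#include <cassert>
#include <vector>

int main() {
  // Concatenation of VectorA = <a1..a6> and VectorB = <b1..b6>, with each
  // element a distinct power of two so the ORed groups are easy to eyeball.
  std::vector<unsigned> ConcatAB = {1,  2,   4,   8,   16,   32,    // a1..a6
                                    64, 128, 256, 512, 1024, 2048}; // b1..b6
  const unsigned ReductionFactor = 3;
  const unsigned TotalNumElems = static_cast<unsigned>(ConcatAB.size());

  std::vector<unsigned> Out(TotalNumElems / ReductionFactor, 0);
  for (unsigned I = 0; I < ReductionFactor; ++I) {
    // Pass I selects lanes {I, I + RF, I + 2*RF, ...}, exactly like the Mask
    // built in the function below, and ORs them into the result.
    for (unsigned X = 0, Lane = 0; X < TotalNumElems;
         X += ReductionFactor, ++Lane)
      Out[Lane] |= ConcatAB[X + I];
  }

  // Out is <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>.
  assert(Out[0] == (1u | 2u | 4u));
  assert(Out[3] == (512u | 1024u | 2048u));
  return 0;
}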
+ Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor, + Value *VectorA, Value *VectorB) { + assert(isa<FixedVectorType>(VectorA->getType())); + unsigned TotalNumElems = + cast<FixedVectorType>(VectorA->getType())->getNumElements(); + + if (VectorB) { + assert(VectorA->getType() == VectorB->getType()); + TotalNumElems = TotalNumElems * 2; + } + + assert(TotalNumElems % ReductionFactor == 0); + + Value *Or = nullptr; + + IRBuilder<> IRB(&I); + for (unsigned i = 0; i < ReductionFactor; i++) { + SmallVector<int, 16> Mask; + for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor) + Mask.push_back(X + i); + + Value *Masked; + if (VectorB) + Masked = IRB.CreateShuffleVector(VectorA, VectorB, Mask); + else + Masked = IRB.CreateShuffleVector(VectorA, Mask); + + if (Or) + Or = IRB.CreateOr(Or, Masked); + else + Or = Masked; + } + + return Or; + } + /// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent /// fields. /// @@ -2701,7 +2749,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(I.getType()->isVectorTy()); assert(I.getArgOperand(0)->getType()->isVectorTy()); - FixedVectorType *ParamType = + [[maybe_unused]] FixedVectorType *ParamType = cast<FixedVectorType>(I.getArgOperand(0)->getType()); assert((I.arg_size() != 2) || (ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()))); @@ -2711,31 +2759,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { 2 * ReturnType->getNumElements()); IRBuilder<> IRB(&I); - unsigned Width = ParamType->getNumElements() * I.arg_size(); // Horizontal OR of shadow - SmallVector<int, 8> EvenMask; - SmallVector<int, 8> OddMask; - for (unsigned X = 0; X < Width; X += 2) { - EvenMask.push_back(X); - OddMask.push_back(X + 1); - } - Value *FirstArgShadow = getShadow(&I, 0); - Value *EvenShadow; - Value *OddShadow; - if (I.arg_size() == 2) { - Value *SecondArgShadow = getShadow(&I, 1); - EvenShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask); - OddShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask); - } else { - EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask); - OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask); - } + Value *SecondArgShadow = nullptr; + if (I.arg_size() == 2) + SecondArgShadow = getShadow(&I, 1); + + Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow, + SecondArgShadow); - Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow); OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I)); setShadow(&I, OrShadow); @@ -2768,23 +2801,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); - unsigned TotalNumElems = ParamType->getNumElements() * I.arg_size(); FixedVectorType *ReinterpretShadowTy = nullptr; assert(isAligned(Align(ReinterpretElemWidth), ParamType->getPrimitiveSizeInBits())); ReinterpretShadowTy = FixedVectorType::get( IRB.getIntNTy(ReinterpretElemWidth), ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth); - TotalNumElems = ReinterpretShadowTy->getNumElements() * I.arg_size(); // Horizontal OR of shadow - SmallVector<int, 8> EvenMask; - SmallVector<int, 8> OddMask; - for (unsigned X = 0; X < TotalNumElems - 1; X += 2) { - EvenMask.push_back(X); - OddMask.push_back(X + 1); - } - Value *FirstArgShadow = getShadow(&I, 0); FirstArgShadow = IRB.CreateBitCast(FirstArgShadow, ReinterpretShadowTy); @@ -2796,22 +2820,15 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> { Align(2), cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements())); - Value *EvenShadow; - Value *OddShadow; + Value *SecondArgShadow = nullptr; if (I.arg_size() == 2) { - Value *SecondArgShadow = getShadow(&I, 1); + SecondArgShadow = getShadow(&I, 1); SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy); - - EvenShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask); - OddShadow = - IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask); - } else { - EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask); - OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask); } - Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow); + Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow, + SecondArgShadow); + OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I)); setShadow(&I, OrShadow); @@ -3219,7 +3236,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Caller guarantees that this intrinsic does not access memory. /// /// TODO: "horizontal"/"pairwise" intrinsics are often incorrectly matched by - /// by this handler. + /// by this handler. See horizontalReduce(). + /// + /// TODO: permutation intrinsics are also often incorrectly matched. [[maybe_unused]] bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I, unsigned int trailingFlags) { @@ -3301,7 +3320,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; - AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (AI) LifetimeStartList.push_back(std::make_pair(&I, AI)); } @@ -3624,9 +3643,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } - // Get an MMX-sized vector type. - Type *getMMXVectorTy(unsigned EltSizeInBits) { - const unsigned X86_MMXSizeInBits = 64; + // Get an MMX-sized (64-bit) vector type, or optionally, other sized + // vectors. + Type *getMMXVectorTy(unsigned EltSizeInBits, + unsigned X86_MMXSizeInBits = 64) { assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && "Illegal MMX vector element size"); return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits), @@ -3826,20 +3846,133 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } - // Instrument multiply-add intrinsic. - void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned MMXEltSizeInBits = 0) { - Type *ResTy = - MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); + // Instrument multiply-add(-accumulate)? intrinsics. + // + // e.g., Two operands: + // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) + // + // Two operands which require an EltSizeInBits override: + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) + // + // Three operands: + // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 + // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (this is equivalent to multiply-add on %a and %b, followed by + // adding/"accumulating" %s. "Accumulation" stores the result in one + // of the source registers, but this accumulate vs. add distinction + // is lost when dealing with LLVM intrinsics.) 
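The doc comment above lists the intrinsic shapes this handler models; as a reference point for the shadow logic that follows, here is a scalar sketch of what the plain two-operand form (pmadd.wd) computes: adjacent signed 16-bit products summed pairwise into 32-bit lanes, i.e. ReductionFactor = 2. This only illustrates the instruction's documented semantics, it is not MSan code, and the three-operand VNNI forms additionally add the accumulator operand into each output lane:

#include <cassert>
#include <cstdint>

// Scalar model of pmadd.wd on one 128-bit register:
// Out[i] = A[2i]*B[2i] + A[2i+1]*B[2i+1], with i16 inputs and i32 outputs.
static void PmaddWd(const int16_t A[8], const int16_t B[8], int32_t Out[4]) {
  for (int I = 0; I < 4; ++I)
    Out[I] =
        int32_t(A[2 * I]) * B[2 * I] + int32_t(A[2 * I + 1]) * B[2 * I + 1];
}

int main() {
  const int16_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const int16_t B[8] = {10, 10, 10, 10, -1, -1, -1, -1};
  int32_t Out[4];
  PmaddWd(A, B, Out);
  assert(Out[0] == 30);  // 1*10 + 2*10
  assert(Out[3] == -15); // 7*-1 + 8*-1
  return 0;
}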
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); - auto *Shadow0 = getShadow(&I, 0); - auto *Shadow1 = getShadow(&I, 1); - Value *S = IRB.CreateOr(Shadow0, Shadow1); - S = IRB.CreateBitCast(S, ResTy); - S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), - ResTy); - S = IRB.CreateBitCast(S, getShadowTy(&I)); - setShadow(&I, S); + + [[maybe_unused]] FixedVectorType *ReturnType = + cast<FixedVectorType>(I.getType()); + assert(isa<FixedVectorType>(ReturnType)); + + // Vectors A and B, and shadows + Value *Va = nullptr; + Value *Vb = nullptr; + Value *Sa = nullptr; + Value *Sb = nullptr; + + assert(I.arg_size() == 2 || I.arg_size() == 3); + if (I.arg_size() == 2) { + Va = I.getOperand(0); + Vb = I.getOperand(1); + + Sa = getShadow(&I, 0); + Sb = getShadow(&I, 1); + } else if (I.arg_size() == 3) { + // Operand 0 is the accumulator. We will deal with that below. + Va = I.getOperand(1); + Vb = I.getOperand(2); + + Sa = getShadow(&I, 1); + Sb = getShadow(&I, 2); + } + + FixedVectorType *ParamType = cast<FixedVectorType>(Va->getType()); + assert(ParamType == Vb->getType()); + + assert(ParamType->getPrimitiveSizeInBits() == + ReturnType->getPrimitiveSizeInBits()); + + if (I.arg_size() == 3) { + assert(ParamType == ReturnType); + assert(ParamType == I.getArgOperand(0)->getType()); + } + + FixedVectorType *ImplicitReturnType = ReturnType; + // Step 1: instrument multiplication of corresponding vector elements + if (EltSizeInBits) { + ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy( + EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits())); + ParamType = cast<FixedVectorType>( + getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits())); + + Va = IRB.CreateBitCast(Va, ParamType); + Vb = IRB.CreateBitCast(Vb, ParamType); + + Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType)); + Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType)); + } else { + assert(ParamType->getNumElements() == + ReturnType->getNumElements() * ReductionFactor); + } + + // Multiplying an *initialized* zero by an uninitialized element results in + // an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *SZero = Constant::getNullValue(Va->getType()); + Value *VZero = Constant::getNullValue(Sa->getType()); + Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); + Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); + Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); + Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + // Each element of the vector is represented by a single bit (poisoned or + // not) e.g., <8 x i1>. + Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + + // Extend <8 x i1> to <8 x i16>. + // (The real pmadd intrinsic would have computed intermediate values of + // <8 x i32>, but that is irrelevant for our shadow purposes because we + // consider each element to be either fully initialized or fully + // uninitialized.) 
+ And = IRB.CreateSExt(And, Sa->getType()); + + // Step 2: instrument horizontal add + // We don't need bit-precise horizontalReduce because we only want to check + // if each pair of elements is fully zero. + // Cast to <4 x i32>. + Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType); + + // Compute <4 x i1>, then extend back to <4 x i32>. + Value *OutShadow = IRB.CreateSExt( + IRB.CreateICmpNE(Horizontal, + Constant::getNullValue(Horizontal->getType())), + ImplicitReturnType); + + // Cast it back to the required fake return type (<1 x i64>). + if (EltSizeInBits) + OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I)); + + // Step 3 (if applicable): instrument accumulator + if (I.arg_size() == 3) + OutShadow = IRB.CreateOr(OutShadow, getShadow(&I, 0)); + + setShadow(&I, OutShadow); setOriginForNaryOp(I); } @@ -5374,21 +5507,185 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorSadIntrinsic(I); break; + // Multiply and Add Packed Words + // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) + // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) + // + // Multiply and Add Packed Signed and Unsigned Bytes + // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) + // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) + // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128 + // (<8 x i16>, <8 x i16>, <4 x i32>, i8) + // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256 + // (<16 x i16>, <16 x i16>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512 + // (<32 x i16>, <32 x i16>, <16 x i32>, i16) + // < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128 + // (<16 x i8>, <16 x i8>, <8 x i16>, i8) + // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256 + // (<32 x i8>, <32 x i8>, <16 x i16>, i16) + // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512 + // (<64 x i8>, <64 x i8>, <32 x i16>, i32) case Intrinsic::x86_sse2_pmadd_wd: case Intrinsic::x86_avx2_pmadd_wd: + case Intrinsic::x86_avx512_pmaddw_d_512: case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I); + case Intrinsic::x86_avx512_pmaddubs_w_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); break; + // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, 8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); break; + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, 16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); break; + // AVX Vector Neural Network Instructions: bytes + // + // Multiply and Add Packed Signed and Unsigned Bytes + // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // Multiply and Add Unsigned and Signed Bytes With Saturation + // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpbusds.512 + 
// (<16 x i32>, <16 x i32>, <16 x i32>) + // + // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx2.vpdpbssd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // + // < 4 x i32> @llvm.x86.avx2.vpdpbssds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx2.vpdpbssds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // + // <16 x i32> @llvm.x86.avx10.vpdpbssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // <16 x i32> @llvm.x86.avx10.vpdpbssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // + // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + case Intrinsic::x86_avx512_vpdpbusd_128: + case Intrinsic::x86_avx512_vpdpbusd_256: + case Intrinsic::x86_avx512_vpdpbusd_512: + case Intrinsic::x86_avx512_vpdpbusds_128: + case Intrinsic::x86_avx512_vpdpbusds_256: + case Intrinsic::x86_avx512_vpdpbusds_512: + case Intrinsic::x86_avx2_vpdpbssd_128: + case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx2_vpdpbssds_128: + case Intrinsic::x86_avx2_vpdpbssds_256: + case Intrinsic::x86_avx10_vpdpbssd_512: + case Intrinsic::x86_avx10_vpdpbssds_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + break; + + // AVX Vector Neural Network Instructions: words + // + // Multiply and Add Signed Word Integers + // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // Multiply and Add Signed Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128 + // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256 + // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // <16 x i32> @llvm.x86.avx512.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> 
@llvm.x86.avx512.mask.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // + // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128 + // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256 + // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512 + // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + case Intrinsic::x86_avx512_vpdpwssd_128: + case Intrinsic::x86_avx512_vpdpwssd_256: + case Intrinsic::x86_avx512_vpdpwssd_512: + case Intrinsic::x86_avx512_vpdpwssds_128: + case Intrinsic::x86_avx512_vpdpwssds_256: + case Intrinsic::x86_avx512_vpdpwssds_512: + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + break; + + // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single + // Precision + // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128 + // (<4 x float>, <8 x bfloat>, <8 x bfloat>) + // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256 + // (<8 x float>, <16 x bfloat>, <16 x bfloat>) + // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512 + // (<16 x float>, <32 x bfloat>, <32 x bfloat>) + // handleVectorPmaddIntrinsic() currently only handles integer types. + case Intrinsic::x86_sse_cmp_ss: case Intrinsic::x86_sse2_cmp_sd: case Intrinsic::x86_sse_comieq_ss: @@ -5603,6 +5900,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleAVXVpermi2var(I); break; + // Packed Shuffle + // llvm.x86.sse.pshuf.w(<1 x i64>, i8) + // llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) + // llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) + // llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) + // llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) + // + // The following intrinsics are auto-upgraded: + // llvm.x86.sse2.pshuf.d(<4 x i32>, i8) + // llvm.x86.sse2.gpshufh.w(<8 x i16>, i8) + // llvm.x86.sse2.pshufl.w(<8 x i16>, i8) + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_sse_pshuf_w: + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_ssse3_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(), + /*trailingVerbatimArgs=*/1); + break; + case Intrinsic::x86_avx512_mask_cvtps2dq_512: { handleAVX512VectorConvertFPToInt(I); break; diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp index 6128581..f5b6686 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp @@ -58,7 +58,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) { uint64_t TrueCount, FalseCount = 0; if (!PA.getSelectInstrProfile(*SI, TrueCount, FalseCount)) continue; - setProfMetadata(F.getParent(), SI, {TrueCount, FalseCount}, + setProfMetadata(SI, {TrueCount, FalseCount}, std::max(TrueCount, FalseCount)); } if (succ_size(&BB) < 2) @@ -67,7 +67,7 @@ void assignProfileData(Function &F, ArrayRef<uint64_t> RawCounters) { if (!PA.getOutgoingBranchWeights(BB, ProfileHolder, MaxCount)) continue; assert(MaxCount > 0); - setProfMetadata(F.getParent(), BB.getTerminator(), ProfileHolder, 
MaxCount); + setProfMetadata(BB.getTerminator(), ProfileHolder, MaxCount); } } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 6f06a26..d9e850e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1727,7 +1727,7 @@ void PGOUseFunc::setBranchWeights() { } if (MaxCount) - setProfMetadata(M, TI, EdgeCounts, MaxCount); + setProfMetadata(TI, EdgeCounts, MaxCount); else { // A zero MaxCount can come about when we have a BB with a positive // count, and whose successor blocks all have 0 count. This can happen @@ -1801,7 +1801,7 @@ void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) { SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0); uint64_t MaxCount = std::max(SCounts[0], SCounts[1]); if (MaxCount) - setProfMetadata(F.getParent(), &SI, SCounts, MaxCount); + setProfMetadata(&SI, SCounts, MaxCount); } void SelectInstVisitor::visitSelectInst(SelectInst &SI) { @@ -2407,13 +2407,9 @@ static std::string getSimpleNodeName(const BasicBlock *Node) { return SimpleNodeName; } -void llvm::setProfMetadata(Module *M, Instruction *TI, - ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) { - assert(MaxCount > 0 && "Bad max count"); - uint64_t Scale = calculateCountScale(MaxCount); - SmallVector<unsigned, 4> Weights; - for (const auto &ECI : EdgeCounts) - Weights.push_back(scaleBranchCount(ECI, Scale)); +void llvm::setProfMetadata(Instruction *TI, ArrayRef<uint64_t> EdgeCounts, + uint64_t MaxCount) { + auto Weights = downscaleWeights(EdgeCounts, MaxCount); LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W : Weights) { @@ -2434,7 +2430,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, uint64_t TotalCount = std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0, [](uint64_t c1, uint64_t c2) { return c1 + c2; }); - Scale = calculateCountScale(WSum); + uint64_t Scale = calculateCountScale(WSum); BranchProbability BP(scaleBranchCount(Weights[0], Scale), scaleBranchCount(WSum, Scale)); std::string BranchProbStr; diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index ce1d9f1..343bec3 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -432,7 +432,7 @@ bool MemOPSizeOpt::perform(MemOp MO) { Updates.clear(); if (MaxCount) - setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); + setProfMetadata(SI, CaseCounts, MaxCount); LLVM_DEBUG(dbgs() << *BB << "\n"); LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 4edf25c..9471ae3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -818,12 +818,12 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { - auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI) return false; Size = GetAllocaSize(AI); - Dest = II->getArgOperand(1); + Dest = II->getArgOperand(0); } else if (auto *AI = dyn_cast<AllocaInst>(I)) { // We need to clear the types for new stack allocations (or else we might // read stale type information from a previous function execution). 
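The setProfMetadata updates above (PGOCtxProfFlattening, PGOInstrumentation, PGOMemOPSizeOpt) drop the unused Module parameter and replace the open-coded calculateCountScale/scaleBranchCount loop with a downscaleWeights helper. Judging by the removed loop, the point of the downscaling is to shrink 64-bit edge counts so the largest one fits in a 32-bit branch weight while preserving the ratios. A rough standalone sketch of that idea; the power-of-two scaling policy here is illustrative and not necessarily what the real helper does:

#include <cstdint>
#include <vector>

static std::vector<uint32_t>
downscaleWeightsSketch(const std::vector<uint64_t> &Counts, uint64_t MaxCount) {
  // Pick a divisor large enough that MaxCount fits in uint32_t, then apply it
  // to every count; relative weights survive up to rounding.
  uint64_t Scale = 1;
  while (MaxCount / Scale > UINT32_MAX)
    Scale *= 2;
  std::vector<uint32_t> Weights;
  Weights.reserve(Counts.size());
  for (uint64_t C : Counts)
    Weights.push_back(static_cast<uint32_t>(C / Scale));
  return Weights;
}

int main() {
  std::vector<uint64_t> Counts = {1ull << 40, 1ull << 39, 0};
  auto W = downscaleWeightsSketch(Counts, /*MaxCount=*/1ull << 40);
  // The 2:1:0 ratio of the raw counts is preserved.
  return (W[0] == 2 * W[1] && W[2] == 0) ? 0 : 1;
}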
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02..765059d 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -95,6 +95,7 @@ add_llvm_component_library(LLVMScalarOpts Analysis Core InstCombine + ProfileData Support TransformUtils ) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index a7ba54f..ac59ae1 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -447,7 +447,7 @@ private: /// Also, collect select instructions to unfold. bool isCandidate(const SwitchInst *SI) { std::deque<std::pair<Value *, BasicBlock *>> Q; - SmallSet<Value *, 16> SeenValues; + SmallPtrSet<Value *, 16> SeenValues; SelectInsts.clear(); Value *SICond = SI->getCondition(); @@ -511,7 +511,7 @@ private: void addToQueue(Value *Val, BasicBlock *BB, std::deque<std::pair<Value *, BasicBlock *>> &Q, - SmallSet<Value *, 16> &SeenValues) { + SmallPtrSet<Value *, 16> &SeenValues) { if (SeenValues.insert(Val).second) Q.push_back({Val, BB}); } @@ -582,17 +582,15 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + getPathsFromStateDefMap(StateDef, SwitchPhi, VB); if (SwitchPhiDefBB == SwitchBlock) { TPaths = std::move(PathsToPhiDef); return; } - assert(MaxNumPaths >= PathsToPhiDef.size()); - auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); if (PathsToSwitchBB.empty()) return; @@ -613,16 +611,13 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB, - unsigned PathsLimit) { + VisitedBlocks &VB) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { - if (Res.size() >= PathsLimit) - break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -658,9 +653,8 @@ private: // Direct predecessor, just add to the path. 
if (IncomingPhiDefBB == IncomingBB) { - assert(PathsLimit > Res.size()); - std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( - StateDef, IncomingPhi, VB, PathsLimit - Res.size()); + std::vector<ThreadingPath> PredPaths = + getPathsFromStateDefMap(StateDef, IncomingPhi, VB); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -673,17 +667,13 @@ private: continue; PathsType IntermediatePaths; - assert(PathsLimit > Res.size()); - auto InterPathLimit = PathsLimit - Res.size(); - IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, - /* PathDepth = */ 1, InterPathLimit); + IntermediatePaths = + paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); if (IntermediatePaths.empty()) continue; - assert(InterPathLimit >= IntermediatePaths.size()); - auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -698,7 +688,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth, unsigned PathsLimit) { + unsigned PathDepth) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -723,10 +713,8 @@ private: // Some blocks have multiple edges to the same successor, and this set // is used to prevent a duplicate path from being generated - SmallSet<BasicBlock *, 4> Successors; + SmallPtrSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { - if (Res.size() >= PathsLimit) - break; if (!Successors.insert(Succ).second) continue; @@ -748,12 +736,14 @@ private: // coverage and compile time. if (LI->getLoopFor(Succ) != CurrLoop) continue; - assert(PathsLimit > Res.size()); - PathsType SuccPaths = - paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); + + PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); + if (Res.size() >= MaxNumPaths) { + return Res; + } } } // This block could now be visited again from a different predecessor. Note @@ -772,7 +762,7 @@ private: SmallVector<PHINode *, 8> Stack; Stack.push_back(FirstDef); - SmallSet<Value *, 16> SeenValues; + SmallPtrSet<Value *, 16> SeenValues; while (!Stack.empty()) { PHINode *CurPhi = Stack.pop_back_val(); @@ -965,7 +955,7 @@ private: DuplicateBlockMap DuplicateMap; DefMap NewDefs; - SmallSet<BasicBlock *, 16> BlocksToClean; + SmallPtrSet<BasicBlock *, 16> BlocksToClean; BlocksToClean.insert_range(successors(SwitchBlock)); for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { @@ -994,7 +984,7 @@ private: /// the predecessors, and phis in the successor blocks. 
void createExitPath(DefMap &NewDefs, ThreadingPath &Path, DuplicateBlockMap &DuplicateMap, - SmallSet<BasicBlock *, 16> &BlocksToClean, + SmallPtrSet<BasicBlock *, 16> &BlocksToClean, DomTreeUpdater *DTU) { APInt NextState = Path.getExitValue(); const BasicBlock *Determinator = Path.getDeterminatorBB(); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 9b87180..37004b9 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -38,6 +38,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" @@ -69,6 +70,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -543,15 +545,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, }; // Insert an unlinked dbg.assign intrinsic for the dead fragment after each - // overlapping dbg.assign intrinsic. The loop invalidates the iterators - // returned by getAssignmentMarkers so save a copy of the markers to iterate - // over. - auto LinkedRange = at::getAssignmentMarkers(Inst); - SmallVector<DbgVariableRecord *> LinkedDVRAssigns = - at::getDVRAssignmentMarkers(Inst); - SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(), - LinkedRange.end()); - auto InsertAssignForOverlap = [&](auto *Assign) { + // overlapping dbg.assign intrinsic. + for (DbgVariableRecord *Assign : at::getDVRAssignmentMarkers(Inst)) { std::optional<DIExpression::FragmentInfo> NewFragment; if (!at::calculateFragmentIntersect(DL, OriginalDest, DeadSliceOffsetInBits, DeadSliceSizeInBits, Assign, @@ -561,11 +556,11 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, // cautious and unlink the whole assignment from the store. Assign->setKillAddress(); Assign->setAssignId(GetDeadLink()); - return; + continue; } // No intersect. if (NewFragment->SizeInBits == 0) - return; + continue; // Fragments overlap: insert a new dbg.assign for this dead part. auto *NewAssign = static_cast<decltype(Assign)>(Assign->clone()); @@ -574,9 +569,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, if (NewFragment) SetDeadFragExpr(NewAssign, *NewFragment); NewAssign->setKillAddress(); - }; - for_each(Linked, InsertAssignForOverlap); - for_each(LinkedDVRAssigns, InsertAssignForOverlap); + } } /// Update the attributes given that a memory access is updated (the @@ -1363,7 +1356,7 @@ struct DSEState { if (auto *CB = dyn_cast<CallBase>(I)) { if (CB->getIntrinsicID() == Intrinsic::lifetime_end) return { - std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; + std::make_pair(MemoryLocation::getForArgument(CB, 0, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } @@ -2666,3 +2659,79 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<LoopAnalysis>(); return PA; } + +namespace { + +/// A legacy pass for the legacy pass manager that wraps \c DSEPass. 
+class DSELegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + DSELegacyPass() : FunctionPass(ID) { + initializeDSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); + PostDominatorTree &PDT = + getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + +#ifdef LLVM_ENABLE_STATS + if (AreStatisticsEnabled()) + for (auto &I : instructions(F)) + NumRemainingStores += isa<StoreInst>(&I); +#endif + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + } +}; + +} // end anonymous namespace + +char DSELegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) + +namespace llvm { +LLVM_ABI FunctionPass *createDeadStoreEliminationPass() { + return new DSELegacyPass(); +} +} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 7704e49..4baa3b3 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -978,7 +978,7 @@ static bool IsValueFullyAvailableInBlock( unsigned NumNewNewSpeculativelyAvailableBBs = 0; #ifndef NDEBUG - SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs; + SmallPtrSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs; SmallVector<BasicBlock *, 32> AvailableBBs; #endif @@ -1222,7 +1222,7 @@ static bool liesBetween(const Instruction *From, Instruction *Between, const Instruction *To, const DominatorTree *DT) { if (From->getParent() == Between->getParent()) return DT->dominates(From, Between); - SmallSet<BasicBlock *, 1> Exclusion; + SmallPtrSet<BasicBlock *, 1> Exclusion; Exclusion.insert(Between->getParent()); return !isPotentiallyReachable(From, To, &Exclusion, DT); } diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 3ba5b79..d99f1eb 100644 --- 
a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -642,9 +642,9 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig, return FI; } - SmallSet<Value *, 16> Visited; + SmallPtrSet<Value *, 16> Visited; SmallVector<Value *, 16> Worklist; - SmallSet<Instruction *, 16> DropPoisonFlags; + SmallPtrSet<Instruction *, 16> DropPoisonFlags; SmallVector<Value *, 16> NeedFreeze; DenseMap<Value *, FreezeInst *> CacheOfFreezes; diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 334c911..6720cb1 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1613,7 +1613,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { if (CurrMaxExit == MaxBECount) SkipLastIter = true; }; - SmallSet<const SCEV *, 8> DominatingExactExitCounts; + SmallPtrSet<const SCEV *, 8> DominatingExactExitCounts; for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); const SCEV *MaxExitCount = SE->getExitCount( diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 85ee824..a097d33 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -434,7 +434,7 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, NewV = NewV->stripPointerCasts(); Function *NewDecl = Intrinsic::getOrInsertDeclaration( M, II->getIntrinsicID(), {NewV->getType()}); - II->setArgOperand(1, NewV); + II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; } @@ -491,7 +491,7 @@ void InferAddressSpacesImpl::collectRewritableIntrinsicOperands( } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(1), + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; } diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 0ddc231..e9bf59c 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { } // Compute alignment from known bits. + auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) { + KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); + unsigned TrailZ = + std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent); + return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + }; + + // Propagate alignment between loads and stores that originate from the + // same base pointer. + DenseMap<Value *, Align> BestBasePointerAligns; + auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) { + APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true); + // Derive the base pointer alignment from the load/store alignment + // and the offset from the base pointer. 
+ Align BasePointerAlign = + commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue()); + + auto [It, Inserted] = + BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign); + if (!Inserted) { + // If the stored base pointer alignment is better than the + // base pointer alignment we derived, we may be able to use it + // to improve the load/store alignment. If not, store the + // improved base pointer alignment for future iterations. + if (It->second > BasePointerAlign) { + Align BetterLoadStoreAlign = + commonAlignment(It->second, OffsetFromBase.getLimitedValue()); + return BetterLoadStoreAlign; + } + It->second = BasePointerAlign; + } + return LoadStoreAlign; + }; + for (BasicBlock &BB : F) { + // We need to reset the map for each block because alignment information + // can only be propagated from instruction A to B if A dominates B. + // This is because control flow (and exception throwing) could be dependent + // on the address (and its alignment) at runtime. Some sort of dominator + // tree approach could be better, but doing a simple forward pass through a + // single basic block is correct too. + BestBasePointerAligns.clear(); + for (Instruction &I : BB) { Changed |= tryToImproveAlign( DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { - KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); - unsigned TrailZ = std::min(Known.countMinTrailingZeros(), - +Value::MaxAlignmentExponent); - return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + return std::max(InferFromKnownBits(I, PtrOp), + InferFromBasePointer(PtrOp, OldAlign)); }); } } diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 7f99cd2..9d915d0 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -7,14 +7,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/JumpTableToSwitch.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ProfDataUtils.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <limits> using namespace llvm; @@ -33,6 +42,8 @@ static cl::opt<unsigned> FunctionSizeThreshold( "or equal than this threshold."), cl::init(50)); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + #define DEBUG_TYPE "jump-table-to-switch" namespace { @@ -90,9 +101,11 @@ static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP, return JumpTable; } -static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, - DomTreeUpdater &DTU, - OptimizationRemarkEmitter &ORE) { +static BasicBlock * +expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU, + OptimizationRemarkEmitter &ORE, + llvm::function_ref<GlobalValue::GUID(const Function &)> + GetGuidForFunction) { const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext()); SmallVector<DominatorTree::UpdateType, 8> DTUpdates; @@ -115,7 +128,30 @@ static BasicBlock *expandToSwitch(CallBase *CB, const 
JumpTableTy &JT, IRBuilder<> BuilderTail(CB); PHINode *PHI = IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size()); + const auto *ProfMD = CB->getMetadata(LLVMContext::MD_prof); + + SmallVector<uint64_t> BranchWeights; + DenseMap<GlobalValue::GUID, uint64_t> GuidToCounter; + const bool HadProfile = isValueProfileMD(ProfMD); + if (HadProfile) { + // The assumptions, coming in, are that the functions in JT.Funcs are + // defined in this module (from parseJumpTable). + assert(llvm::all_of( + JT.Funcs, [](const Function *F) { return F && !F->isDeclaration(); })); + BranchWeights.reserve(JT.Funcs.size() + 1); + // The first is the default target, which is the unreachable block created + // above. + BranchWeights.push_back(0U); + uint64_t TotalCount = 0; + auto Targets = getValueProfDataFromInst( + *CB, InstrProfValueKind::IPVK_IndirectCallTarget, + std::numeric_limits<uint32_t>::max(), TotalCount); + for (const auto &[G, C] : Targets) { + [[maybe_unused]] auto It = GuidToCounter.insert({G, C}); + assert(It.second); + } + } for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) { BasicBlock *B = BasicBlock::Create(Func->getContext(), "call." + Twine(Index), &F, Tail); @@ -123,10 +159,19 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, DTUpdates.push_back({DominatorTree::Insert, B, Tail}); CallBase *Call = cast<CallBase>(CB->clone()); + // The MD_prof metadata (VP kind), if it existed, can be dropped, it doesn't + // make sense on a direct call. Note that the values are used for the branch + // weights of the switch. + Call->setMetadata(LLVMContext::MD_prof, nullptr); Call->setCalledFunction(Func); Call->insertInto(B, B->end()); Switch->addCase( cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B); + GlobalValue::GUID FctID = GetGuidForFunction(*Func); + // It'd be OK to _not_ find target functions in GuidToCounter, e.g. suppose + // just some of the jump targets are taken (for the given profile). + BranchWeights.push_back(FctID == 0U ? 0U + : GuidToCounter.lookup_or(FctID, 0U)); BranchInst::Create(Tail, B); if (PHI) PHI->addIncoming(Call, B); @@ -136,6 +181,13 @@ static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT, return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB) << "expanded indirect call into switch"; }); + if (HadProfile && !ProfcheckDisableMetadataFixes) { + // At least one of the targets must've been taken. 
+ assert(llvm::any_of(BranchWeights, [](uint64_t V) { return V != 0; })); + setBranchWeights(*Switch, downscaleWeights(BranchWeights), + /*IsExpected=*/false); + } else + setExplicitlyUnknownBranchWeights(*Switch); if (PHI) CB->replaceAllUsesWith(PHI); CB->eraseFromParent(); @@ -150,6 +202,15 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F); DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; + InstrProfSymtab Symtab; + if (auto E = Symtab.create(*F.getParent())) + F.getContext().emitError( + "Could not create indirect call table, likely corrupted IR" + + toString(std::move(E))); + DenseMap<const Function *, GlobalValue::GUID> FToGuid; + for (const auto &[G, FPtr] : Symtab.getIDToNameMap()) + FToGuid.insert({FPtr, G}); + for (BasicBlock &BB : make_early_inc_range(F)) { BasicBlock *CurrentBB = &BB; while (CurrentBB) { @@ -170,7 +231,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy); if (!JumpTable) continue; - SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE); + SplittedOutTail = expandToSwitch( + Call, *JumpTable, DTU, ORE, [&](const Function &Fct) { + if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) + return AssignGUIDPass::getGUID(Fct); + return FToGuid.lookup_or(&Fct, 0U); + }); Changed = true; break; } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index c3f80f9..e157cc9 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -169,6 +169,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( "number of accesses allowed to be present in a loop in order to " "enable memory promotion.")); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, @@ -472,7 +474,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, - LicmAllowSpeculation); + LicmAllowSpeculation, HasCoroSuspendInst); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -857,9 +859,18 @@ public: } // Now finally clone BI. - ReplaceInstWithInst( - HoistTarget->getTerminator(), - BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition())); + auto *NewBI = + BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition(), + HoistTarget->getTerminator()->getIterator()); + HoistTarget->getTerminator()->eraseFromParent(); + // md_prof should also come from the original branch - since the + // condition was hoisted, the branch probabilities shouldn't change. + if (!ProfcheckDisableMetadataFixes) + NewBI->copyMetadata(*BI, {LLVMContext::MD_prof}); + // FIXME: Issue #152767: debug info should also be the same as the + // original branch, **if** the user explicitly indicated that. 
+ NewBI->setDebugLoc(HoistTarget->getTerminator()->getDebugLoc()); + ++NumClonedBranches; assert(CurLoop->getLoopPreheader() && @@ -881,7 +892,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, bool LoopNestMode, - bool AllowSpeculation) { + bool AllowSpeculation, bool HasCoroSuspendInst) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -914,11 +925,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. - if (CurLoop->hasLoopInvariantOperands(&I) && + if (CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && - isSafeToExecuteUnconditionally( - I, DT, TLI, CurLoop, SafetyInfo, ORE, - Preheader->getTerminator(), AC, AllowSpeculation)) { + isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AC, + AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -964,7 +975,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); }; if ((IsInvariantStart(I) || isGuard(&I)) && - CurLoop->hasLoopInvariantOperands(&I) && + CurLoop->hasLoopInvariantOperands(&I, HasCoroSuspendInst) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); @@ -1230,11 +1241,16 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (Behavior.doesNotAccessMemory()) return true; if (Behavior.onlyReadsMemory()) { + // Might have stale MemoryDef for call that was later inferred to be + // read-only. + auto *MU = dyn_cast<MemoryUse>(MSSA->getMemoryAccess(CI)); + if (!MU) + return false; + // If we can prove there are no writes to the memory read by the call, we // can hoist or sink. return !pointerInvalidatedByLoop( - MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I, Flags, - /*InvariantGroup=*/false); + MSSA, MU, CurLoop, I, Flags, /*InvariantGroup=*/false); } if (Behavior.onlyWritesMemory()) { @@ -1688,8 +1704,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. - !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) - I.dropUBImplyingAttrsAndMetadata(); + !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) { + if (ProfcheckDisableMetadataFixes) + I.dropUBImplyingAttrsAndMetadata(); + else + I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof}); + } if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. @@ -2856,7 +2876,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, bool LVInRHS = L.isLoopInvariant(BO->getOperand(0)); auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(LVInRHS)); if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() || - BO0->hasNUsesOrMore(3)) + BO0->hasNUsesOrMore(BO0->getType()->isIntegerTy() ? 
2 : 3)) return false; Value *LV = BO0->getOperand(0); diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0ac1a15..27d3004 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -502,8 +502,10 @@ public: SmallVector<int, 8> PtrToPartitions(N); for (unsigned I = 0; I < N; ++I) { Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; - auto Instructions = - LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); + auto Instructions = LAI.getInstructionsForAccess(Ptr, /* IsWrite */ true); + auto ReadInstructions = + LAI.getInstructionsForAccess(Ptr, /* IsWrite */ false); + Instructions.append(ReadInstructions.begin(), ReadInstructions.end()); int &Partition = PtrToPartitions[I]; // First set it to uninitialized. diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index b3bffeb..5795c76 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) { case llvm::Instruction::FPExt: return true; case llvm::Instruction::AddrSpaceCast: + case CastInst::PtrToAddr: case CastInst::PtrToInt: case CastInst::IntToPtr: return false; @@ -1208,7 +1209,7 @@ public: // // For verification, we keep track of where we changed uses to poison in // PoisonedInsts and then check that we in fact remove them. - SmallSet<Instruction *, 16> PoisonedInsts; + SmallPtrSet<Instruction *, 16> PoisonedInsts; for (auto *Inst : reverse(ToRemove)) { for (Use &U : llvm::make_early_inc_range(Inst->uses())) { if (auto *Poisoned = dyn_cast<Instruction>(U.getUser())) @@ -2166,7 +2167,7 @@ public: // If the loads don't alias the lifetime.end, it won't interfere with // fusion. - MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr); + MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr); if (!EndLoc.Ptr) continue; if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc)) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 79721dc..e043d07 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -915,7 +915,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // move the bitcast as well, which we don't handle. if (SkippedLifetimeStart) { auto *LifetimeArg = - dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1)); + dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(0)); if (LifetimeArg && LifetimeArg->getParent() == C->getParent() && C->comesBefore(LifetimeArg)) return false; @@ -1010,7 +1010,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // Lifetime of srcAlloca ends at lifetime.end. 
if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::lifetime_end && - II->getArgOperand(1) == srcAlloca) + II->getArgOperand(0) == srcAlloca) break; } @@ -1393,7 +1393,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) if (II->getIntrinsicID() == Intrinsic::lifetime_start) if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) - return II->getArgOperand(1) == Alloca; + return II->getArgOperand(0) == Alloca; return false; } @@ -1530,7 +1530,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, // to remove them. SmallVector<Instruction *, 4> LifetimeMarkers; - SmallSet<Instruction *, 4> AAMetadataInstrs; + SmallPtrSet<Instruction *, 4> AAMetadataInstrs; bool SrcNotDom = false; auto CaptureTrackingWithModRef = @@ -1540,7 +1540,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); Worklist.reserve(MaxUsesToExplore); - SmallSet<const Use *, 20> Visited; + SmallPtrSet<const Use *, 20> Visited; while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (const Use &U : I->uses()) { diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 1a52af1..9d4fb79 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -651,7 +651,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice; + mutable DenseMap<const BitCastInst *, const Value *> PredicateSwapChoice; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -819,7 +819,7 @@ private: BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; ExprResult performSymbolicCmpEvaluation(Instruction *) const; - ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const; + ExprResult performSymbolicPredicateInfoEvaluation(BitCastInst *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -841,7 +841,7 @@ private: unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; bool shouldSwapOperandsForPredicate(const Value *, const Value *, - const IntrinsicInst *I) const; + const BitCastInst *I) const; // Reachability handling. void updateReachableEdge(BasicBlock *, BasicBlock *); @@ -1013,9 +1013,9 @@ void NewGVN::deleteExpression(const Expression *E) const { // If V is a predicateinfo copy, get the thing it is a copy of. 
static Value *getCopyOf(const Value *V) { - if (auto *II = dyn_cast<IntrinsicInst>(V)) - if (II->getIntrinsicID() == Intrinsic::ssa_copy) - return II->getOperand(0); + if (auto *BC = dyn_cast<BitCastInst>(V)) + if (BC->getType() == BC->getOperand(0)->getType()) + return BC->getOperand(0); return nullptr; } @@ -1535,7 +1535,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - auto *LifetimePtr = II->getOperand(1); + auto *LifetimePtr = II->getOperand(0); if (LoadPtr == lookupOperandLeader(LifetimePtr) || AA->isMustAlias(LoadPtr, LifetimePtr)) return createConstantExpression(UndefValue::get(LoadType)); @@ -1604,7 +1604,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } NewGVN::ExprResult -NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { +NewGVN::performSymbolicPredicateInfoEvaluation(BitCastInst *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return ExprResult::none(); @@ -1647,13 +1647,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast<CallInst>(I); if (auto *II = dyn_cast<IntrinsicInst>(I)) { - // Intrinsics with the returned attribute are copies of arguments. - if (auto *ReturnedValue = II->getReturnedArgOperand()) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) - if (auto Res = performSymbolicPredicateInfoEvaluation(II)) - return Res; + if (auto *ReturnedValue = II->getReturnedArgOperand()) return ExprResult::some(createVariableOrConstant(ReturnedValue)); - } } // FIXME: Currently the calls which may access the thread id may @@ -2032,6 +2027,12 @@ NewGVN::performSymbolicEvaluation(Instruction *I, E = performSymbolicLoadEvaluation(I); break; case Instruction::BitCast: + // Intrinsics with the returned attribute are copies of arguments. + if (I->getType() == I->getOperand(0)->getType()) + if (auto Res = + performSymbolicPredicateInfoEvaluation(cast<BitCastInst>(I))) + return Res; + [[fallthrough]]; case Instruction::AddrSpaceCast: case Instruction::Freeze: return createExpression(I); @@ -4075,8 +4076,7 @@ bool NewGVN::eliminateInstructions(Function &F) { if (DominatingLeader != Def) { // Even if the instruction is removed, we still need to update // flags/metadata due to downstreams users of the leader. - if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>())) - patchReplacementInstruction(DefI, DominatingLeader); + patchReplacementInstruction(DefI, DominatingLeader); SmallVector<DbgVariableRecord *> DVRUsers; findDbgUsers(DefI, DVRUsers); @@ -4116,10 +4116,14 @@ bool NewGVN::eliminateInstructions(Function &F) { Value *DominatingLeader = EliminationStack.back(); - auto *II = dyn_cast<IntrinsicInst>(DominatingLeader); - bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy; - if (isSSACopy) - DominatingLeader = II->getOperand(0); + Instruction *SSACopy = nullptr; + if (auto *BC = dyn_cast<BitCastInst>(DominatingLeader)) { + if (BC->getType() == BC->getOperand(0)->getType() && + PredInfo->getPredicateInfoFor(DominatingLeader)) { + SSACopy = BC; + DominatingLeader = BC->getOperand(0); + } + } // Don't replace our existing users with ourselves. 
if (U->get() == DominatingLeader) @@ -4145,12 +4149,12 @@ bool NewGVN::eliminateInstructions(Function &F) { ProbablyDead.erase(cast<Instruction>(DominatingLeader)); // For copy instructions, we use their operand as a leader, // which means we remove a user of the copy and it may become dead. - if (isSSACopy) { - auto It = UseCounts.find(II); + if (SSACopy) { + auto It = UseCounts.find(SSACopy); if (It != UseCounts.end()) { unsigned &IIUseCount = It->second; if (--IIUseCount == 0) - ProbablyDead.insert(II); + ProbablyDead.insert(SSACopy); } } ++LeaderUseCount; @@ -4251,7 +4255,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { } bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B, - const IntrinsicInst *I) const { + const BitCastInst *I) const { if (shouldSwapOperands(A, B)) { PredicateSwapChoice[I] = B; return true; diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 343da5b2..ba58b8e 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -878,7 +878,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // only that it mostly looks like one. static bool isLoadCombineCandidate(Instruction *Or) { SmallVector<Instruction *, 8> Worklist; - SmallSet<Instruction *, 8> Visited; + SmallPtrSet<Instruction *, 8> Visited; auto Enqueue = [&](Value *V) { auto *I = dyn_cast<Instruction>(V); diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index d9805d8..8b15445 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -2309,8 +2309,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain, } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { // Cost of the address calculation - Type *ValTy = GEP->getSourceElementType(); - Cost += TTI.getAddressComputationCost(ValTy); + Cost += TTI.getAddressComputationCost( + GEP->getType(), nullptr, nullptr, + TargetTransformInfo::TCK_SizeAndLatency); // And cost of the GEP itself // TODO: Use TTI->getGEPCost here (it exists, but appears to be not diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 8be2f78..feee794 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -31,6 +32,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -53,12 +55,15 @@ STATISTIC(NumInstReplaced, // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. 
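Both the NewGVN hunks above and the SCCP hunks that follow assume the same new convention from PredicateInfo: the ssa_copy intrinsic markers are now materialized as no-op bitcasts (source and destination types identical) rather than llvm.ssa.copy calls. A hedged sketch of the recognition idiom these passes share; the helper name is illustrative and not taken from any of these files:

// Sketch: is this instruction a PredicateInfo copy under the new
// bitcast-based representation, and if so, what value does it stand for?
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"

static llvm::Value *getPredicateCopyOperand(llvm::Instruction *I,
                                            llvm::PredicateInfo &PredInfo) {
  using namespace llvm;
  auto *BC = dyn_cast<BitCastInst>(I);
  if (!BC || BC->getType() != BC->getOperand(0)->getType())
    return nullptr;          // not a no-op bitcast
  if (!PredInfo.getPredicateInfoFor(BC))
    return nullptr;          // an ordinary bitcast, not a predicate copy
  return BC->getOperand(0);  // the value the copy stands for
}

The extra getPredicateInfoFor check matters because a same-type bitcast can in principle appear for other reasons; only the ones registered by PredicateInfo carry a constraint.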
static bool runSCCP(Function &F, const DataLayout &DL, - const TargetLibraryInfo *TLI, DomTreeUpdater &DTU) { + const TargetLibraryInfo *TLI, DominatorTree &DT, + AssumptionCache &AC) { LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); SCCPSolver Solver( DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }, F.getContext()); + Solver.addPredicateInfo(F, DT, AC); + // While we don't do any actual inter-procedural analysis, still track // return values so we can infer attributes. if (canTrackReturnsInterprocedurally(&F)) @@ -101,6 +106,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, } // Remove unreachable blocks and non-feasible edges. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); for (BasicBlock *DeadBB : BlocksToErase) NumInstRemoved += changeToUnreachable(&*DeadBB->getFirstNonPHIIt(), /*PreserveLCSSA=*/false, &DTU); @@ -113,6 +119,8 @@ static bool runSCCP(Function &F, const DataLayout &DL, if (!DeadBB->hasAddressTaken()) DTU.deleteBB(DeadBB); + Solver.removeSSACopies(F); + Solver.inferReturnAttributes(); return MadeChanges; @@ -121,9 +129,9 @@ static bool runSCCP(Function &F, const DataLayout &DL, PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout &DL = F.getDataLayout(); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); - auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - if (!runSCCP(F, DL, &TLI, DTU)) + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + if (!runSCCP(F, DL, &TLI, DT, AC)) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 03d9f32..06a92bd 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -320,15 +320,6 @@ static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) { DVR->getDebugLoc().getInlinedAt()); } -DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) { - (void)Unused; - return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P)); -} -DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) { - (void)Unused; - return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P)); -} - /// Find linked dbg.assign and generate a new one with the correct /// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the /// value component is copied from the old dbg.assign to the new. @@ -348,10 +339,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { - auto MarkerRange = at::getAssignmentMarkers(OldInst); auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. - if (MarkerRange.empty() && DVRAssignMarkerRange.empty()) + if (DVRAssignMarkerRange.empty()) return; LLVM_DEBUG(dbgs() << " migrateDebugInfo\n"); @@ -435,11 +425,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, } ::Value *NewValue = Value ? 
Value : DbgAssign->getValue(); - auto *NewAssign = UnwrapDbgInstPtr( + DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, Dest, DIExpression::get(Expr->getContext(), {}), - DbgAssign->getDebugLoc()), - DbgAssign); + DbgAssign->getDebugLoc()))); // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. @@ -1260,10 +1249,7 @@ private: return PI.setAborted(&II); if (II.isLifetimeStartOrEnd()) { - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), - Length->getLimitedValue()); - insertUse(II, Offset, Size, true); + insertUse(II, Offset, AllocSize, true); return; } @@ -3235,8 +3221,7 @@ private: // In theory we should call migrateDebugInfo here. However, we do not // emit dbg.assign intrinsics for mem intrinsics storing through non- // constant geps, or storing a variable number of bytes. - assert(at::getAssignmentMarkers(&II).empty() && - at::getDVRAssignmentMarkers(&II).empty() && + assert(at::getDVRAssignmentMarkers(&II).empty() && "AT: Unexpected link to non-const GEP"); deleteIfTriviallyDead(OldPtr); return false; @@ -3385,13 +3370,11 @@ private: Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) { // Update the address component of linked dbg.assigns. - auto UpdateAssignAddress = [&](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) { if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) || DbgAssign->getAddress() == II.getDest()) DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr); - }; - for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress); - for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress); + } II.setDest(AdjustedPtr); II.setDestAlignment(SliceAlign); } else { @@ -3614,30 +3597,14 @@ private: return true; } - assert(II.getArgOperand(1) == OldPtr); - // Lifetime intrinsics are only promotable if they cover the whole alloca. - // Therefore, we drop lifetime intrinsics which don't cover the whole - // alloca. - // (In theory, intrinsics which partially cover an alloca could be - // promoted, but PromoteMemToReg doesn't handle that case.) - // FIXME: Check whether the alloca is promotable before dropping the - // lifetime intrinsics? - if (NewBeginOffset != NewAllocaBeginOffset || - NewEndOffset != NewAllocaEndOffset) - return true; - - ConstantInt *Size = - ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - NewEndOffset - NewBeginOffset); - // Lifetime intrinsics always expect an i8* so directly get such a pointer - // for the new alloca slice. 
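This SROA hunk, like the MemCpyOpt, CodeExtractor, InlineFunction, Local.cpp and MemoryTaggingSupport hunks elsewhere in this diff, assumes the newer lifetime intrinsic form in which the explicit size operand is gone and the pointer is operand 0. A minimal sketch of locating a lifetime.end for a given alloca under that assumption; the helper name is illustrative:

// Sketch only: assumes the single-operand lifetime form, i.e.
//   call void @llvm.lifetime.end.p0(ptr %alloca)
// rather than the older (i64 size, ptr) form.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"

static bool isLifetimeEndOf(const llvm::Instruction &I,
                            const llvm::AllocaInst *AI) {
  using namespace llvm;
  if (auto *II = dyn_cast<IntrinsicInst>(&I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end)
      return II->getArgOperand(0) == AI; // pointer is now operand 0
  return false;
}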
+ assert(II.getArgOperand(0) == OldPtr); Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace()); Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) - New = IRB.CreateLifetimeStart(Ptr, Size); + New = IRB.CreateLifetimeStart(Ptr); else - New = IRB.CreateLifetimeEnd(Ptr, Size); + New = IRB.CreateLifetimeEnd(Ptr); (void)New; LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -4005,8 +3972,7 @@ private: Store->getPointerOperand(), Store->getValueOperand(), DL); } else { - assert(at::getAssignmentMarkers(Store).empty() && - at::getDVRAssignmentMarkers(Store).empty() && + assert(at::getDVRAssignmentMarkers(Store).empty() && "AT: unexpected debug.assign linked to store through " "unbounded GEP"); } diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index c7e4a3e..032a3a7 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -37,6 +37,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeMergeICmpsLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); + initializeDSELegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6ffe841..fc96589 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -294,6 +294,10 @@ private: bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, bool NonNegative); + /// Analyze XOR instruction to extract disjoint constant bits that behave + /// like addition operations for improved address mode folding. + APInt extractDisjointBitsFromXor(BinaryOperator *XorInst); + /// The path from the constant offset to the old GEP index. e.g., if the GEP /// index is "a * b + (c + 5)". After running function find, UserChain[0] will /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and @@ -596,6 +600,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, // Trace into subexpressions for more hoisting opportunities. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); + // Handle XOR with disjoint bits that can be treated as addition. + else if (BO->getOpcode() == Instruction::Xor) + ConstantOffset = extractDisjointBitsFromXor(BO); } else if (isa<TruncInst>(V)) { ConstantOffset = find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative) @@ -708,11 +715,20 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { Value *NextInChain = removeConstOffset(ChainIndex - 1); Value *TheOther = BO->getOperand(1 - OpNo); - // If NextInChain is 0 and not the LHS of a sub, we can simplify the - // sub-expression to be just TheOther. if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) - return TheOther; + if (CI->isZero()) { + // Custom XOR handling for disjoint bits - preserves original XOR + // with non-disjoint constant bits. + // TODO: The design should be updated to support partial constant + // extraction. 
+ if (BO->getOpcode() == Instruction::Xor) + return BO; + + // If NextInChain is 0 and not the LHS of a sub, we can simplify the + // sub-expression to be just TheOther. + if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0)) + return TheOther; + } } BinaryOperator::BinaryOps NewOp = BO->getOpcode(); @@ -743,6 +759,67 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return NewBO; } +/// Analyze XOR instruction to extract disjoint constant bits for address +/// folding +/// +/// This function identifies bits in an XOR constant operand that are disjoint +/// from the base operand's known set bits. For these disjoint bits, XOR behaves +/// identically to addition, allowing us to extract them as constant offsets +/// that can be folded into addressing modes. +/// +/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) + +/// DisjointBits` where DisjointBits = Const & KnownZeros(Base) +/// +/// Example with ptr having known-zero low bit: +/// Original: `xor %ptr, 3` ; 3 = 0b11 +/// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01 +/// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode +/// +/// \param XorInst The XOR binary operator to analyze +/// \return APInt containing the disjoint bits that can be extracted as offset, +/// or zero if no disjoint bits exist +APInt ConstantOffsetExtractor::extractDisjointBitsFromXor( + BinaryOperator *XorInst) { + assert(XorInst && XorInst->getOpcode() == Instruction::Xor && + "Expected XOR instruction"); + + const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits(); + Value *BaseOperand; + ConstantInt *XorConstant; + + // Match pattern: xor BaseOperand, Constant. + if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant)))) + return APInt::getZero(BitWidth); + + // Compute known bits for the base operand. + const SimplifyQuery SQ(DL); + const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ); + const APInt &ConstantValue = XorConstant->getValue(); + + // Identify disjoint bits: constant bits that are known zero in base. + const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero; + + // Early exit if no disjoint bits found. + if (DisjointBits.isZero()) + return APInt::getZero(BitWidth); + + // Compute the remaining non-disjoint bits that stay in the XOR. + const APInt NonDisjointBits = ConstantValue & ~DisjointBits; + + // FIXME: Enhance XOR constant extraction to handle nested binary operations. + // Currently we only extract disjoint bits from the immediate XOR constant, + // but we could recursively process cases like: + // xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2)) + // This requires careful analysis to ensure the transformation preserves + // semantics, particularly around sign extension and overflow behavior. + + // Add the non-disjoint constant to the user chain for later transformation + // This will replace the original constant in the XOR with the new + // constant. + UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits)); + return DisjointBits; +} + /// A helper function to check if reassociating through an entry in the user /// chain would invalidate the GEP's nuw flag. 
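A compact sketch of the disjoint-bit split described in the comment above, kept independent of the extractor state; the struct and helper names are illustrative only:

// Sketch: split an xor'ed constant into the part that behaves like an add
// (bits known to be zero in the base operand) and the part that must stay
// in the xor. Mirrors the Base ^ Const ==> (Base ^ NonDisjoint) + Disjoint
// rewrite documented above.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

struct XorSplit {
  llvm::APInt Disjoint;    // extractable as a constant offset
  llvm::APInt NonDisjoint; // remains as the xor constant
};

static XorSplit splitXorConstant(const llvm::APInt &Const,
                                 const llvm::KnownBits &BaseKnown) {
  llvm::APInt Disjoint = Const & BaseKnown.Zero;
  return {Disjoint, Const & ~Disjoint};
}

For the example in the comment (Const = 0b11, base with known-zero bit 0), Disjoint comes out as 0b01 and NonDisjoint as 0b10, giving the (xor %ptr, 2) + 1 shape.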
static bool allowsPreservingNUW(const User *U) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index f6959ca2..9b40fc0 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2144,23 +2144,9 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, bool CurrentLoopValid, bool PartiallyInvariant, bool InjectedCondition, ArrayRef<Loop *> NewLoops) { - auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag, - StringRef DisableTag) { - auto &Ctx = TargetLoop->getHeader()->getContext(); - MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag)); - MDNode *NewLoopID = makePostTransformationMetadata( - Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD}); - TargetLoop->setLoopID(NewLoopID); - }; - - // If we performed a non-trivial unswitch, we have added new cloned loops. - // Mark such newly-created loops as visited. - if (!NewLoops.empty()) { - for (Loop *NL : NewLoops) - RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial", - "llvm.loop.unswitch.nontrivial.disable"); + // If we did a non-trivial unswitch, we have added new (cloned) loops. + if (!NewLoops.empty()) U.addSiblingLoops(NewLoops); - } // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2168,12 +2154,24 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, if (PartiallyInvariant) { // Mark the new loop as partially unswitched, to avoid unswitching on // the same condition again. - RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial", - "llvm.loop.unswitch.partial.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.partial.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.partial"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else if (InjectedCondition) { // Do the same for injection of invariant conditions. - RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection", - "llvm.loop.unswitch.injection.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.injection.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.injection"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else U.revisitCurrentLoop(); } else @@ -2811,9 +2809,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch -/// candidates available. Also consider the number of "sibling" loops with -/// the idea of accounting for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch +/// candidates available. Also accounting for the number of "sibling" loops with +/// the idea to account for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. 
Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -3509,9 +3507,8 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; - if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable")) - collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU); + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable")) collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, PartialIVCondBranch, L, DT, LI, AA, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 44e63a0..b17dcb78 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -328,7 +328,7 @@ class StructurizeCFG { void addPhiValues(BasicBlock *From, BasicBlock *To); void findUndefBlocks(BasicBlock *PHIBlock, - const SmallSet<BasicBlock *, 8> &Incomings, + const SmallPtrSet<BasicBlock *, 8> &Incomings, SmallVector<BasicBlock *> &UndefBlks) const; void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, @@ -762,7 +762,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { /// from some blocks as undefined. The function will find out all such blocks /// and return in \p UndefBlks. void StructurizeCFG::findUndefBlocks( - BasicBlock *PHIBlock, const SmallSet<BasicBlock *, 8> &Incomings, + BasicBlock *PHIBlock, const SmallPtrSet<BasicBlock *, 8> &Incomings, SmallVector<BasicBlock *> &UndefBlks) const { // We may get a post-structured CFG like below: // @@ -788,7 +788,7 @@ void StructurizeCFG::findUndefBlocks( // path N->F2->F3->B. For example, the threads take the branch F1->N may // always take the branch F2->P2. So, when we are reconstructing a PHI // originally in B, we can safely say the incoming value from N is undefined. - SmallSet<BasicBlock *, 8> VisitedBlock; + SmallPtrSet<BasicBlock *, 8> VisitedBlock; SmallVector<BasicBlock *, 8> Stack; if (PHIBlock == ParentRegion->getExit()) { for (auto P : predecessors(PHIBlock)) { @@ -884,7 +884,7 @@ void StructurizeCFG::setPhiValues() { PhiMap &BlkPhis = OldPhiIt->second; SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To]; - SmallSet<BasicBlock *, 8> Incomings; + SmallPtrSet<BasicBlock *, 8> Incomings; // Get the undefined blocks shared by all the phi nodes. if (!BlkPhis.empty()) { diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index ddd203f3..42b1fdf 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -111,15 +111,14 @@ BasicBlock * llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options, const Twine &BBName) { - assert(!isa<IndirectBrInst>(TI) && - "Cannot split critical edge from IndirectBrInst"); - BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); - // Splitting the critical edge to a pad block is non-trivial. Don't do - // it in this generic function. - if (DestBB->isEHPad()) return nullptr; + // Splitting the critical edge to a pad block is non-trivial. + // And we cannot split block with IndirectBr as a terminator. 
+ // Don't do it in this generic function. + if (DestBB->isEHPad() || isa<IndirectBrInst>(TI)) + return nullptr; if (Options.IgnoreUnreachableDests && isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime())) diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 40010ae..8044f61 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -193,7 +193,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() { if (Candidates.empty()) return false; - SmallSet<PHINode *, 8> ProcessedPHIs; + SmallPtrSet<PHINode *, 8> ProcessedPHIs; for (const auto &Info : Candidates) { PHINode *PHI = Info.PHI; if (!ProcessedPHIs.insert(Info.PHI).second) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7a9dd37..bbd1ed6 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1099,7 +1099,7 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, // Get the memory operand of the lifetime marker. If the underlying // object is a sunk alloca, or is otherwise defined in the extraction // region, the lifetime marker must not be erased. - Value *Mem = II->getOperand(1)->stripInBoundsOffsets(); + Value *Mem = II->getOperand(0); if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem)) continue; @@ -1115,8 +1115,6 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, static void insertLifetimeMarkersSurroundingCall( Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd, CallInst *TheCall) { - LLVMContext &Ctx = M->getContext(); - auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1); Instruction *Term = TheCall->getParent()->getTerminator(); // Emit lifetime markers for the pointers given in \p Objects. Insert the @@ -1130,7 +1128,7 @@ static void insertLifetimeMarkersSurroundingCall( Function *Func = Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); - auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); + auto Marker = CallInst::Create(Func, Mem); if (InsertBefore) Marker->insertBefore(TheCall->getIterator()); else diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 4b0065d..8954de6 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -276,7 +276,7 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize( DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks, const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) { #ifndef NDEBUG - SmallSet<BasicBlock *, 8> Incoming; + SmallPtrSet<BasicBlock *, 8> Incoming; #endif SetVector<BasicBlock *> Outgoing; diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 291e2a5..7063cde 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -706,6 +706,15 @@ bool llvm::checkDebugInfoMetadata(Module &M, DILocsBefore, DILocsAfter, InstToDelete, NameOfWrappedPass, FileNameFromCU, ShouldWriteIntoJSON, Bugs); +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE + // If we are tracking DebugLoc coverage, replace each empty DebugLoc with an + // annotated location now so that it does not show up in future passes even if + // it is propagated to other instructions. 
+ for (auto &L : DILocsAfter) + if (!L.second) + const_cast<Instruction *>(L.first)->setDebugLoc(DebugLoc::getUnknown()); +#endif + bool ResultForVars = checkVars(DIVarsBefore, DIVarsAfter, NameOfWrappedPass, FileNameFromCU, ShouldWriteIntoJSON, Bugs); diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 540039b..0642d51 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -30,7 +30,7 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, FunctionType *FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); - const char *FuncName = RTLCI.getLibcallImplName(Impl); + StringRef FuncName = RTLCI.getLibcallImplName(Impl); M.getOrInsertFunction(FuncName, FuncTy); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 59a47a9..f49fbf8 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -77,7 +77,6 @@ #include <cstdint> #include <deque> #include <iterator> -#include <limits> #include <optional> #include <string> #include <utility> @@ -3004,31 +3003,11 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, if (hasLifetimeMarkers(AI)) continue; - // Try to determine the size of the allocation. - ConstantInt *AllocaSize = nullptr; - if (ConstantInt *AIArraySize = - dyn_cast<ConstantInt>(AI->getArraySize())) { - auto &DL = Caller->getDataLayout(); - Type *AllocaType = AI->getAllocatedType(); - TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType); - uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); - - // Don't add markers for zero-sized allocas. - if (AllocaArraySize == 0) - continue; - - // Check that array size doesn't saturate uint64_t and doesn't - // overflow when it's multiplied by type size. - if (!AllocaTypeSize.isScalable() && - AllocaArraySize != std::numeric_limits<uint64_t>::max() && - std::numeric_limits<uint64_t>::max() / AllocaArraySize >= - AllocaTypeSize.getFixedValue()) { - AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), - AllocaArraySize * AllocaTypeSize); - } - } + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + if (Size && Size->isZero()) + continue; - builder.CreateLifetimeStart(AI, AllocaSize); + builder.CreateLifetimeStart(AI); for (ReturnInst *RI : Returns) { // Don't insert llvm.lifetime.end calls between a musttail or deoptimize // call and a return. The return kills all local allocas. @@ -3038,7 +3017,7 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall()) continue; - IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); + IRBuilder<>(RI).CreateLifetimeEnd(AI); } } } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2619e73..ac34490 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -275,7 +275,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Builder.CreateBr(TheOnlyDest); BasicBlock *BB = SI->getParent(); - SmallSet<BasicBlock *, 8> RemovedSuccessors; + SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; // Remove entries from PHI nodes which we no longer branch to... 
BasicBlock *SuccToKeep = TheOnlyDest; @@ -343,7 +343,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, if (auto *BA = dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); - SmallSet<BasicBlock *, 8> RemovedSuccessors; + SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; // Insert the new branch. Builder.CreateBr(TheOnlyDest); @@ -481,7 +481,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, return true; if (II->isLifetimeStartOrEnd()) { - auto *Arg = II->getArgOperand(1); + auto *Arg = II->getArgOperand(0); if (isa<PoisonValue>(Arg)) return true; @@ -2518,7 +2518,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA, if (MSSAU) MSSAU->changeToUnreachable(I); - SmallSet<BasicBlock *, 8> UniqueSuccessors; + SmallPtrSet<BasicBlock *, 8> UniqueSuccessors; // Loop over all of the successors, removing BB's entry from any PHI // nodes. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index e7623aa..2d830f3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -914,6 +914,8 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { switch (RK) { default: llvm_unreachable("Unexpected recurrence kind"); + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: return Intrinsic::vector_reduce_add; case RecurKind::Mul: @@ -1301,6 +1303,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, Builder.getFastMathFlags()); }; switch (RdxKind) { + case RecurKind::AddChainWithSubs: + case RecurKind::Sub: case RecurKind::Add: case RecurKind::Mul: case RecurKind::And: diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 472c03f..1f59b17 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -155,7 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, return; } if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) { - AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI || getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting) return; diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index b22ecbc..978d5a2 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -20,7 +20,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -291,6 +290,11 @@ void PredicateInfoBuilder::convertUsesToDFSOrdered( Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) { for (auto &U : Op->uses()) { if (auto *I = dyn_cast<Instruction>(U.getUser())) { + // Lifetime intrinsics must work directly on alloca, do not replace them + // with a predicated copy. + if (I->isLifetimeStartOrEnd()) + continue; + ValueDFS VD; // Put the phi node uses in the incoming block. 
BasicBlock *IBlock; @@ -370,6 +374,8 @@ void PredicateInfoBuilder::processAssume( Values.push_back(Cond); if (auto *Cmp = dyn_cast<CmpInst>(Cond)) collectCmpOps(Cmp, Values); + else if (match(Cond, m_NUWTrunc(m_Value(Op0)))) + Values.push_back(Op0); for (Value *V : Values) { if (shouldRename(V)) { @@ -416,6 +422,8 @@ void PredicateInfoBuilder::processBranch( Values.push_back(Cond); if (auto *Cmp = dyn_cast<CmpInst>(Cond)) collectCmpOps(Cmp, Values); + else if (match(Cond, m_NUWTrunc(m_Value(Op0)))) + Values.push_back(Op0); for (Value *V : Values) { if (shouldRename(V)) { @@ -506,23 +514,10 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin() ? OrigOp : (RenameStack.end() - Start - 1)->Def; - auto CreateSSACopy = [this](IRBuilderBase &B, Value *Op, - const Twine &Name = "") { - auto It = PI.DeclarationCache.try_emplace(Op->getType()); - if (It.second) { - // The number of named values is used to detect if a new declaration - // was added. If so, that declaration is tracked so that it can be - // removed when the analysis is done. The corner case were a new - // declaration results in a name clash and the old name being renamed - // is not considered as that represents an invalid module. - auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::ssa_copy, Op->getType()); - if (NumDecls != F.getParent()->getNumNamedValues()) - PI.CreatedDeclarations.insert(IF); - It.first->second = IF; - } - return B.CreateCall(It.first->second, Op, Name); + auto CreateSSACopy = [](Instruction *InsertPt, Value *Op, + const Twine &Name = "") { + // Use a no-op bitcast to represent ssa copy. + return new BitCastInst(Op, Op->getType(), Name, InsertPt->getIterator()); }; // For edge predicates, we can just place the operand in the block before // the terminator. For assume, we have to place it right after the assume @@ -530,9 +525,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, // right before the terminator or after the assume, so that we insert in // proper order in the case of multiple predicateinfo in the same block. if (isa<PredicateWithEdge>(ValInfo)) { - IRBuilder<> B(getBranchTerminator(ValInfo)); - CallInst *PIC = - CreateSSACopy(B, Op, Op->getName() + "." + Twine(Counter++)); + BitCastInst *PIC = CreateSSACopy(getBranchTerminator(ValInfo), Op, + Op->getName() + "." + Twine(Counter++)); PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } else { @@ -541,8 +535,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); // Insert the predicate directly after the assume. While it also holds // directly before it, assume(i1 true) is not a useful fact. - IRBuilder<> B(PAssume->AssumeInst->getNextNode()); - CallInst *PIC = CreateSSACopy(B, Op); + BitCastInst *PIC = CreateSSACopy(PAssume->AssumeInst->getNextNode(), Op); PI.PredicateMap.insert({PIC, ValInfo}); Result.Def = PIC; } @@ -710,23 +703,6 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT, Builder.buildPredicateInfo(); } -// Remove all declarations we created . The PredicateInfo consumers are -// responsible for remove the ssa_copy calls created. -PredicateInfo::~PredicateInfo() { - // Collect function pointers in set first, as SmallSet uses a SmallVector - // internally and we have to remove the asserting value handles first. 
- SmallPtrSet<Function *, 20> FunctionPtrs; - for (const auto &F : CreatedDeclarations) - FunctionPtrs.insert(&*F); - CreatedDeclarations.clear(); - - for (Function *F : FunctionPtrs) { - assert(F->users().empty() && - "PredicateInfo consumer did not remove all SSA copies."); - F->eraseFromParent(); - } -} - std::optional<PredicateConstraint> PredicateBase::getConstraint() const { switch (Type) { case PT_Assume: @@ -741,6 +717,11 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const { : ConstantInt::getFalse(Condition->getType())}}; } + if (match(Condition, m_NUWTrunc(m_Specific(RenamedOp)))) { + return {{TrueEdge ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ, + ConstantInt::getNullValue(RenamedOp->getType())}}; + } + CmpInst *Cmp = dyn_cast<CmpInst>(Condition); if (!Cmp) { // TODO: Make this an assertion once RenamedOp is fully accurate. @@ -779,15 +760,16 @@ std::optional<PredicateConstraint> PredicateBase::getConstraint() const { void PredicateInfo::verifyPredicateInfo() const {} -// Replace ssa_copy calls created by PredicateInfo with their operand. +// Replace bitcasts created by PredicateInfo with their operand. static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) { const auto *PI = PredInfo.getPredicateInfoFor(&Inst); - auto *II = dyn_cast<IntrinsicInst>(&Inst); - if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy) + if (!PI) continue; - Inst.replaceAllUsesWith(II->getOperand(0)); + assert(isa<BitCastInst>(Inst) && + Inst.getType() == Inst.getOperand(0)->getType()); + Inst.replaceAllUsesWith(Inst.getOperand(0)); Inst.eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index 0ffea3f..41647f7 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -8,10 +8,8 @@ #include "llvm/Transforms/Utils/ProfileVerify.h" #include "llvm/ADT/DynamicAPInt.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index d96f1d6..10c162b 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -136,7 +136,7 @@ public: /// \p ToDelete that stores to this alloca. void updateForDeletedStore( StoreInst *ToDelete, DIBuilder &DIB, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const { + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. if (DVRAssigns.empty()) @@ -382,7 +382,7 @@ struct PromoteMem2Reg { SmallVector<AssignmentTrackingInfo, 8> AllocaATInfo; /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. - SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete; + SmallPtrSet<DbgVariableRecord *, 8> DVRAssignsToDelete; /// The set of basic blocks the renamer has already visited. BitVector Visited; @@ -533,11 +533,10 @@ static void removeIntrinsicUsers(AllocaInst *AI) { /// false there were some loads which were not dominated by the single store /// and thus must be phi-ed with undef. 
We fall back to the standard alloca /// promotion algorithm in that case. -static bool -rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, - const DataLayout &DL, DominatorTree &DT, - AssumptionCache *AC, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { +static bool rewriteSingleStoreAlloca( + AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, + DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; Value *ReplVal = OnlyStore->getOperand(0); // Loads may either load the stored value or uninitialized memory (undef). @@ -647,11 +646,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, /// use(t); /// *A = 42; /// } -static bool -promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, - LargeBlockInfo &LBI, const DataLayout &DL, - DominatorTree &DT, AssumptionCache *AC, - SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { +static bool promoteSingleBlockAlloca( + AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, + const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not // significantly pessimize the small block case. This uses LargeBlockInfo to diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b78c702..8448517 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -777,10 +777,10 @@ public: for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { - if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { + if (auto *BC = dyn_cast<BitCastInst>(&Inst)) { + if (BC->getType() == BC->getOperand(0)->getType()) { if (It->second->getPredicateInfoFor(&Inst)) { - Value *Op = II->getOperand(0); + Value *Op = BC->getOperand(0); Inst.replaceAllUsesWith(Op); Inst.eraseFromParent(); } @@ -1413,6 +1413,15 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (ValueState[&I].isOverdefined()) return; + if (auto *BC = dyn_cast<BitCastInst>(&I)) { + if (BC->getType() == BC->getOperand(0)->getType()) { + if (const PredicateBase *PI = getPredicateInfoFor(&I)) { + handlePredicate(&I, I.getOperand(0), PI); + return; + } + } + } + ValueLatticeElement OpSt = getValueState(I.getOperand(0)); if (OpSt.isUnknownOrUndef()) return; @@ -1433,8 +1442,12 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false); Type *DestTy = I.getDestTy(); - ConstantRange Res = - OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits()); + ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits()); + if (auto *Trunc = dyn_cast<TruncInst>(&I)) + Res = OpRange.truncate(DestTy->getScalarSizeInBits(), + Trunc->getNoWrapKind()); + else + Res = OpRange.castOp(I.getOpcode(), DestTy->getScalarSizeInBits()); mergeInValue(LV, &I, ValueLatticeElement::getRange(Res)); } else markOverdefined(&I); @@ -2001,17 +2014,6 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { Function *F = CB.getCalledFunction(); if (auto *II = dyn_cast<IntrinsicInst>(&CB)) { - if (II->getIntrinsicID() == Intrinsic::ssa_copy) { - if (ValueState[&CB].isOverdefined()) - return; - - Value *CopyOf = 
CB.getOperand(0); - const PredicateBase *PI = getPredicateInfoFor(&CB); - assert(PI && "Missing predicate info for ssa.copy"); - handlePredicate(&CB, CopyOf, PI); - return; - } - if (II->getIntrinsicID() == Intrinsic::vscale) { unsigned BitWidth = CB.getType()->getScalarSizeInBits(); const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1eb8996..e218db3 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1346,7 +1346,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { CanonicalIV->insertBefore(Header->begin()); rememberInstruction(CanonicalIV); - SmallSet<BasicBlock *, 4> PredSeen; + SmallPtrSet<BasicBlock *, 4> PredSeen; Constant *One = ConstantInt::get(Ty, 1); for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) { BasicBlock *HP = *HPI; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index deabacc..055e8ca 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -291,6 +291,7 @@ class SimplifyCFGOpt { bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); @@ -564,6 +565,9 @@ struct ConstantComparesGatherer { /// Number of comparisons matched in the and/or chain unsigned UsedICmps = 0; + /// If the elements in Vals matches the comparisons + bool IsEq = false; + /// Construct and compute the result for the comparison instruction Cond ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) { gather(Cond); @@ -735,23 +739,23 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value())); - + Value *Op0, *Op1; + if (match(V, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) + IsEq = true; + else if (match(V, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) + IsEq = false; + else + return; // Keep a stack (SmallVector for efficiency) for depth-first traversal - SmallVector<Value *, 8> DFT; - SmallPtrSet<Value *, 8> Visited; - - // Initialize - Visited.insert(V); - DFT.push_back(V); + SmallVector<Value *, 8> DFT{Op0, Op1}; + SmallPtrSet<Value *, 8> Visited{V, Op0, Op1}; while (!DFT.empty()) { V = DFT.pop_back_val(); if (Instruction *I = dyn_cast<Instruction>(V)) { // If it is a || (or && depending on isEQ), process the operands. - Value *Op0, *Op1; - if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) + if (IsEq ? 
match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) : match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) { if (Visited.insert(Op1).second) DFT.push_back(Op1); @@ -762,7 +766,7 @@ private: } // Try to match the current instruction - if (matchInstruction(I, isEQ)) + if (matchInstruction(I, IsEq)) // Match succeed, continue the loop continue; } @@ -810,11 +814,15 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors())) CV = SI->getCondition(); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) - if (BI->isConditional() && BI->getCondition()->hasOneUse()) + if (BI->isConditional() && BI->getCondition()->hasOneUse()) { if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) { if (ICI->isEquality() && getConstantInt(ICI->getOperand(1), DL)) CV = ICI->getOperand(0); + } else if (auto *Trunc = dyn_cast<TruncInst>(BI->getCondition())) { + if (Trunc->hasNoUnsignedWrap()) + CV = Trunc->getOperand(0); } + } // Unwrap any lossless ptrtoint cast. if (CV) { @@ -840,11 +848,20 @@ BasicBlock *SimplifyCFGOpt::getValueEqualityComparisonCases( } BranchInst *BI = cast<BranchInst>(TI); - ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); - Cases.push_back(ValueEqualityComparisonCase( - getConstantInt(ICI->getOperand(1), DL), Succ)); - return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); + Value *Cond = BI->getCondition(); + ICmpInst::Predicate Pred; + ConstantInt *C; + if (auto *ICI = dyn_cast<ICmpInst>(Cond)) { + Pred = ICI->getPredicate(); + C = getConstantInt(ICI->getOperand(1), DL); + } else { + Pred = ICmpInst::ICMP_NE; + auto *Trunc = cast<TruncInst>(Cond); + C = ConstantInt::get(cast<IntegerType>(Trunc->getOperand(0)->getType()), 0); + } + BasicBlock *Succ = BI->getSuccessor(Pred == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(C, Succ)); + return BI->getSuccessor(Pred == ICmpInst::ICMP_EQ); } /// Given a vector of bb/value pairs, remove any entries @@ -1106,7 +1123,10 @@ static void getBranchWeights(Instruction *TI, // default weight to be the first entry. if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { assert(Weights.size() == 2); - ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); + auto *ICI = dyn_cast<ICmpInst>(BI->getCondition()); + if (!ICI) + return; + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) std::swap(Weights.front(), Weights.back()); } @@ -3321,12 +3341,10 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // %merge = select %cond, %two, %one // store %merge, %x.dest, !DIAssignID !2 // dbg.assign %merge, "x", ..., !2 - auto replaceVariable = [OrigV, S](auto *DbgAssign) { + for (DbgVariableRecord *DbgAssign : + at::getDVRAssignmentMarkers(SpeculatedStore)) if (llvm::is_contained(DbgAssign->location_ops(), OrigV)) DbgAssign->replaceVariableLocationOp(OrigV, S); - }; - for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable); - for_each(at::getDVRAssignmentMarkers(SpeculatedStore), replaceVariable); } // Metadata can be dependent on the condition we are hoisting above. 
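The TruncInst cases above, and the m_NUWTrunc patterns added to PredicateInfo earlier in this diff, lean on one fact: an i1 produced by a trunc carrying the nuw flag is true exactly when its wider source is non-zero (any other non-zero source would make the trunc poison). A small sketch of that check; the helper name is illustrative:

// Sketch: a branch on `trunc nuw iN %x to i1` behaves like a branch on
// `icmp ne iN %x, 0`, so %x can serve as the value-equality comparison
// base, with 0 routed to the false successor.
#include "llvm/IR/Instructions.h"

static llvm::Value *getEqualityBaseFromCondition(llvm::Value *Cond) {
  using namespace llvm;
  if (auto *Trunc = dyn_cast<TruncInst>(Cond))
    if (Trunc->hasNoUnsignedWrap() && Trunc->getType()->isIntegerTy(1))
      return Trunc->getOperand(0); // non-zero iff the condition is true
  return nullptr;
}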
@@ -3655,15 +3673,19 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, return false; } -static bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI, - DomTreeUpdater *DTU, - const DataLayout &DL, - AssumptionCache *AC) { +bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(BranchInst *BI) { + // Note: If BB is a loop header then there is a risk that threading introduces + // a non-canonical loop by moving a back edge. So we avoid this optimization + // for loop headers if NeedCanonicalLoop is set. + if (Options.NeedCanonicalLoop && is_contained(LoopHeaders, BI->getParent())) + return false; + std::optional<bool> Result; bool EverChanged = false; do { // Note that None means "we changed things, but recurse further." - Result = foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC); + Result = + foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, Options.AC); EverChanged |= Result == std::nullopt || *Result; } while (Result == std::nullopt); return EverChanged; @@ -5084,6 +5106,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, Value *CompVal = ConstantCompare.CompValue; unsigned UsedICmps = ConstantCompare.UsedICmps; Value *ExtraCase = ConstantCompare.Extra; + bool TrueWhenEqual = ConstantCompare.IsEq; // If we didn't have a multiply compared value, fail. if (!CompVal) @@ -5093,8 +5116,6 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (UsedICmps <= 1) return false; - bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value())); - // There might be duplicate constants in the list, which the switch // instruction can't handle, remove them now. array_pod_sort(Values.begin(), Values.end(), constantIntSortPredicate); @@ -8085,7 +8106,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // If this is a branch on something for which we know the constant value in // predecessors (e.g. a phi node in the current block), thread control // through this block. - if (foldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC)) + if (foldCondBranchOnValueKnownInPredecessor(BI)) return requestResimplify(); // Scan predecessor blocks for conditional branches. 
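For readers less familiar with this corner of SimplifyCFG: the gatherer's IsEq/TrueWhenEqual flag records whether the chain was an or-of-equalities (branch taken when equal) or an and-of-inequalities. A source-level illustration, not taken from the test suite, of the shape that simplifyBranchOnICmpChain converts into a switch with one extra non-constant case:

// Illustrative only: an or-chain of equalities against constants on a
// single value, plus one "extra" term. The gatherer collects {1, 4, 9}
// for X with IsEq = true, and Extra is tested in a separate block.
static int classify(unsigned X, bool Extra) {
  if (X == 1 || X == 4 || X == 9 || Extra)
    return 1;
  return 0;
}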
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 737321d..2d6a748 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/DataLayout.h" @@ -319,10 +320,10 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); } else if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); - const APInt *X, *Y; + uint64_t X, Y; uint64_t DerefMin = 1; - if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) { - DerefMin = std::min(X->getZExtValue(), Y->getZExtValue()); + if (match(Size, m_Select(m_Value(), m_ConstantInt(X), m_ConstantInt(Y)))) { + DerefMin = std::min(X, Y); annotateDereferenceableBytes(CI, ArgNos, DerefMin); } } @@ -977,8 +978,14 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, // it's not very useful because calling strlen for a pointer of other types is // very uncommon. if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) { - // TODO: Handle subobjects. - if (!isGEPBasedOnPointerToString(GEP, CharSize)) + unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); + SmallMapVector<Value *, APInt, 4> VarOffsets; + APInt ConstOffset(BW, 0); + assert(CharSize % 8 == 0 && "Expected a multiple of 8 sized CharSize"); + // Check the gep is a single variable offset. + if (!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) || + VarOffsets.size() != 1 || ConstOffset != 0 || + VarOffsets.begin()->second != CharSize / 8) return nullptr; ConstantDataArraySlice Slice; @@ -1000,10 +1007,8 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, return nullptr; } - Value *Offset = GEP->getOperand(2); + Value *Offset = VarOffsets.begin()->first; KnownBits Known = computeKnownBits(Offset, DL, nullptr, CI, nullptr); - uint64_t ArrSize = - cast<ArrayType>(GEP->getSourceElementType())->getNumElements(); // If Offset is not provably in the range [0, NullTermIdx], we can still // optimize if we can prove that the program has undefined behavior when @@ -1011,7 +1016,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, // is a pointer to an object whose memory extent is NullTermIdx+1. 
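The switch to m_ConstantInt above keeps the same bound as before: when the size operand of the libcall is a select between two constant sizes, the smaller constant is dereferenced on every path, so it is the amount that can be annotated unconditionally. A tiny sketch of that arithmetic in plain C++ with hypothetical sizes:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint64_t derefMin(bool Cond, uint64_t X, uint64_t Y) {
  uint64_t Size = Cond ? X : Y;    // size the call actually receives
  uint64_t Lower = std::min(X, Y); // conservative dereferenceable byte count
  assert(Lower <= Size);
  return Lower;
}

int main() {
  assert(derefMin(true, 16, 4) == 4);
  assert(derefMin(false, 16, 4) == 4);
  return 0;
}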
if ((Known.isNonNegative() && Known.getMaxValue().ule(NullTermIdx)) || (isa<GlobalVariable>(GEP->getOperand(0)) && - NullTermIdx == ArrSize - 1)) { + NullTermIdx == Slice.Length - 1)) { Offset = B.CreateSExtOrTrunc(Offset, CI->getType()); return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx), Offset); diff --git a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp index 6b18ece..c3ac39e 100644 --- a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp +++ b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp @@ -12,7 +12,6 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -21,7 +20,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include <map> -#include <string> #include <utility> using namespace llvm; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index c47fd942..789047a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -793,280 +793,296 @@ static bool canWidenCallReturnType(Type *Ty) { } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + bool Result = true; // For each block in the loop. for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for hazards. for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast<PHINode>(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - reportVectorizationFailure("Found a non-int non-pointer PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop); - return false; - } + Result &= canVectorizeInstr(I); + if (!DoExtraAnalysis && !Result) + return false; + } + } - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Non-header phi nodes that have outside uses can be vectorized. Add - // them to the list of allowed exits. - // Unsafe cyclic dependencies with header phis are identified during - // legalization for reduction, induction and fixed order - // recurrences. - AllowedExit.insert(&I); - continue; - } + if (!PrimaryInduction) { + if (Inductions.empty()) { + reportVectorizationFailure( + "Did not find one integer induction var", + "loop induction variable could not be identified", + "NoInductionVariable", ORE, TheLoop); + return false; + } + if (!WidestIndTy) { + reportVectorizationFailure( + "Did not find one integer induction var", + "integer loop induction variable could not be identified", + "NoIntegerInductionVariable", ORE, TheLoop); + return false; + } + LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + } - // We only allow if-converted PHIs with exactly two incoming values. 
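The refactored canVectorizeInstrs above keeps scanning after a failed instruction when extra analysis is allowed (so every remark gets emitted) and bails out on the first failure otherwise. A stripped-down sketch of that accumulation pattern in plain C++, with per-instruction results modeled as booleans:

#include <cassert>
#include <vector>

static bool checkAll(const std::vector<bool> &PerInstrOK, bool DoExtraAnalysis,
                     int &ChecksRun) {
  bool Result = true;
  for (bool OK : PerInstrOK) {
    ++ChecksRun;      // each check may emit a diagnostic remark
    Result &= OK;
    if (!DoExtraAnalysis && !Result)
      return false;   // fail fast when nobody wants the extra remarks
  }
  return Result;
}

int main() {
  std::vector<bool> OKs = {true, false, false, true};
  int Fast = 0, Thorough = 0;
  assert(!checkAll(OKs, /*DoExtraAnalysis=*/false, Fast) && Fast == 2);
  assert(!checkAll(OKs, /*DoExtraAnalysis=*/true, Thorough) && Thorough == 4);
  return 0;
}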
- if (Phi->getNumIncomingValues() != 2) { - reportVectorizationFailure("Found an invalid PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop, Phi); - return false; - } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT, PSE.getSE())) { - Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } + return Result; +} - // We prevent matching non-constant strided pointer IVS to preserve - // historical vectorizer behavior after a generalization of the - // IVDescriptor code. The intent is to remove this check, but we - // have to fix issues around code quality for such loops first. - auto IsDisallowedStridedPointerInduction = - [](const InductionDescriptor &ID) { - if (AllowStridedPointerIVs) - return false; - return ID.getKind() == InductionDescriptor::IK_PtrInduction && - ID.getConstIntStepValue() == nullptr; - }; - - // TODO: Instead of recording the AllowedExit, it would be good to - // record the complementary set: NotAllowedExit. These include (but may - // not be limited to): - // 1. Reduction phis as they represent the one-before-last value, which - // is not available when vectorized - // 2. Induction phis and increment when SCEV predicates cannot be used - // outside the loop - see addInductionPhi - // 3. Non-Phis with outside uses when SCEV predicates cannot be used - // outside the loop - see call to hasOutsideLoopUser in the non-phi - // handling below - // 4. FixedOrderRecurrence phis that can possibly be handled by - // extraction. - // By recording these, we can then reason about ways to vectorize each - // of these NotAllowedExit. - InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - Requirements->addExactFPMathInst(ID.getExactFPMathInst()); - continue; - } +bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) { + BasicBlock *BB = I.getParent(); + BasicBlock *Header = TheLoop->getHeader(); - if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { - AllowedExit.insert(Phi); - FixedOrderRecurrences.insert(Phi); - continue; - } + if (auto *Phi = dyn_cast<PHINode>(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + reportVectorizationFailure( + "Found a non-int non-pointer PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop); + return false; + } - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. 
+ if (BB != Header) { + // Non-header phi nodes that have outside uses can be vectorized. Add + // them to the list of allowed exits. + // Unsafe cyclic dependencies with header phis are identified during + // legalization for reduction, induction and fixed order + // recurrences. + AllowedExit.insert(&I); + return true; + } - reportVectorizationFailure("Found an unidentified PHI", - "value that could not be identified as " - "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); - return false; - } // end of PHI handling - - // We handle calls that: - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast<CallInst>(&I); - - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !(CI->getCalledFunction() && TLI && - (!VFDatabase::getMappings(*CI).empty() || - isTLIScalarize(*TLI, *CI)))) { - // If the call is a recognized math libary call, it is likely that - // we can vectorize it given loosened floating-point constraints. - LibFunc Func; - bool IsMathLibCall = - TLI && CI->getCalledFunction() && - CI->getType()->isFloatingPointTy() && - TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && - TLI->hasOptimizedCodeGen(Func); - - if (IsMathLibCall) { - // TODO: Ideally, we should not use clang-specific language here, - // but it's hard to provide meaningful yet generic advice. - // Also, should this be guarded by allowExtraAnalysis() and/or be part - // of the returned info from isFunctionVectorizable()? - reportVectorizationFailure( - "Found a non-intrinsic callsite", - "library call cannot be vectorized. " - "Try compiling with -fno-math-errno, -ffast-math, " - "or similar flags", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } else { - reportVectorizationFailure("Found a non-intrinsic callsite", - "call instruction cannot be vectorized", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } - return false; - } + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + reportVectorizationFailure( + "Found an invalid PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop, Phi); + return false; + } - // Some intrinsics have scalar arguments and should be same in order for - // them to be vectorized (i.e. loop invariant). - if (CI) { - auto *SE = PSE.getSE(); - Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) - if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), - TheLoop)) { - reportVectorizationFailure("Found unvectorizable intrinsic", - "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", ORE, TheLoop, CI); - return false; - } - } - } + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT, + PSE.getSE())) { + Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; + return true; + } - // If we found a vectorized variant of a function, note that so LV can - // make better decisions about maximum VF. 
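The non-header PHI handling above leans on the fact that a two-way PHI outside the header is exactly what if-conversion turns into a select, so it needs no induction or reduction classification. A plain C++ sketch of that equivalence with a hypothetical pair of incoming values:

#include <cassert>

static int phiViaBranch(bool Cond, int A, int B) {
  int Phi;
  if (Cond)
    Phi = A; // incoming value from the "then" block
  else
    Phi = B; // incoming value from the "else" block
  return Phi;
}

static int phiViaSelect(bool Cond, int A, int B) {
  return Cond ? A : B; // the select the if-converted form uses instead
}

int main() {
  for (bool Cond : {false, true})
    assert(phiViaBranch(Cond, 3, 5) == phiViaSelect(Cond, 3, 5));
  return 0;
}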
- if (CI && !VFDatabase::getMappings(*CI).empty()) - VecCallVariantsFound = true; - - auto CanWidenInstructionTy = [](Instruction const &Inst) { - Type *InstTy = Inst.getType(); - if (!isa<StructType>(InstTy)) - return canVectorizeTy(InstTy); - - // For now, we only recognize struct values returned from calls where - // all users are extractvalue as vectorizable. All element types of the - // struct must be types that can be widened. - return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && - all_of(Inst.users(), IsaPred<ExtractValueInst>); - }; + // We prevent matching non-constant strided pointer IVS to preserve + // historical vectorizer behavior after a generalization of the + // IVDescriptor code. The intent is to remove this check, but we + // have to fix issues around code quality for such loops first. + auto IsDisallowedStridedPointerInduction = + [](const InductionDescriptor &ID) { + if (AllowStridedPointerIVs) + return false; + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + ID.getConstIntStepValue() == nullptr; + }; + + // TODO: Instead of recording the AllowedExit, it would be good to + // record the complementary set: NotAllowedExit. These include (but may + // not be limited to): + // 1. Reduction phis as they represent the one-before-last value, which + // is not available when vectorized + // 2. Induction phis and increment when SCEV predicates cannot be used + // outside the loop - see addInductionPhi + // 3. Non-Phis with outside uses when SCEV predicates cannot be used + // outside the loop - see call to hasOutsideLoopUser in the non-phi + // handling below + // 4. FixedOrderRecurrence phis that can possibly be handled by + // extraction. + // By recording these, we can then reason about ways to vectorize each + // of these NotAllowedExit. + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + Requirements->addExactFPMathInst(ID.getExactFPMathInst()); + return true; + } - // Check that the instruction return type is vectorizable. - // We can't vectorize casts from vector type to scalar type. - // Also, we can't vectorize extractelement instructions. - if (!CanWidenInstructionTy(I) || - (isa<CastInst>(I) && - !VectorType::isValidElementType(I.getOperand(0)->getType())) || - isa<ExtractElementInst>(I)) { - reportVectorizationFailure("Found unvectorizable type", - "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); - return false; - } + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { + AllowedExit.insert(Phi); + FixedOrderRecurrences.insert(Phi); + return true; + } + + // As a last resort, coerce the PHI to a AddRec expression + // and re-try classifying it a an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + return true; + } - // Check that the stored type is vectorizable. 
- if (auto *ST = dyn_cast<StoreInst>(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - reportVectorizationFailure("Store instruction cannot be vectorized", - "CantVectorizeStore", ORE, TheLoop, ST); + reportVectorizationFailure("Found an unidentified PHI", + "value that could not be identified as " + "reduction is used outside the loop", + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, + Phi); + return false; + } // end of PHI handling + + // We handle calls that: + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. + auto *CI = dyn_cast<CallInst>(&I); + + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && + !(CI->getCalledFunction() && TLI && + (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) { + // If the call is a recognized math libary call, it is likely that + // we can vectorize it given loosened floating-point constraints. + LibFunc Func; + bool IsMathLibCall = + TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() && + TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && + TLI->hasOptimizedCodeGen(Func); + + if (IsMathLibCall) { + // TODO: Ideally, we should not use clang-specific language here, + // but it's hard to provide meaningful yet generic advice. + // Also, should this be guarded by allowExtraAnalysis() and/or be part + // of the returned info from isFunctionVectorizable()? + reportVectorizationFailure( + "Found a non-intrinsic callsite", + "library call cannot be vectorized. " + "Try compiling with -fno-math-errno, -ffast-math, " + "or similar flags", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } else { + reportVectorizationFailure("Found a non-intrinsic callsite", + "call instruction cannot be vectorized", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } + return false; + } + + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized (i.e. loop invariant). + if (CI) { + auto *SE = PSE.getSE(); + Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); + for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) { + reportVectorizationFailure( + "Found unvectorizable intrinsic", + "intrinsic instruction cannot be vectorized", + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } + } + } - // For nontemporal stores, check that a nontemporal vector version is - // supported on the target. - if (ST->getMetadata(LLVMContext::MD_nontemporal)) { - // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of stored type"); - if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { - reportVectorizationFailure( - "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ORE, TheLoop, ST); - return false; - } - } + // If we found a vectorized variant of a function, note that so LV can + // make better decisions about maximum VF. + if (CI && !VFDatabase::getMappings(*CI).empty()) + VecCallVariantsFound = true; + + auto CanWidenInstructionTy = [](Instruction const &Inst) { + Type *InstTy = Inst.getType(); + if (!isa<StructType>(InstTy)) + return canVectorizeTy(InstTy); + + // For now, we only recognize struct values returned from calls where + // all users are extractvalue as vectorizable. 
All element types of the + // struct must be types that can be widened. + return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) && + all_of(Inst.users(), IsaPred<ExtractValueInst>); + }; - } else if (auto *LD = dyn_cast<LoadInst>(&I)) { - if (LD->getMetadata(LLVMContext::MD_nontemporal)) { - // For nontemporal loads, check that a nontemporal vector version is - // supported on the target (arbitrarily try a vector of 2 elements). - auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of load type"); - if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { - reportVectorizationFailure( - "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); - return false; - } - } + // Check that the instruction return type is vectorizable. + // We can't vectorize casts from vector type to scalar type. + // Also, we can't vectorize extractelement instructions. + if (!CanWidenInstructionTy(I) || + (isa<CastInst>(I) && + !VectorType::isValidElementType(I.getOperand(0)->getType())) || + isa<ExtractElementInst>(I)) { + reportVectorizationFailure("Found unvectorizable type", + "instruction return type cannot be vectorized", + "CantVectorizeInstructionReturnType", ORE, + TheLoop, &I); + return false; + } + + // Check that the stored type is vectorizable. + if (auto *ST = dyn_cast<StoreInst>(&I)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) { + reportVectorizationFailure("Store instruction cannot be vectorized", + "CantVectorizeStore", ORE, TheLoop, ST); + return false; + } - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. - // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); + // For nontemporal stores, check that a nontemporal vector version is + // supported on the target. + if (ST->getMetadata(LLVMContext::MD_nontemporal)) { + // Arbitrarily try a vector of 2 elements. + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of stored type"); + if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { + reportVectorizationFailure( + "nontemporal store instruction cannot be vectorized", + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); + return false; } + } - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - // We can safely vectorize loops where instructions within the loop are - // used outside the loop only if the SCEV predicates within the loop is - // same as outside the loop. Allowing the exit means reusing the SCEV - // outside the loop. - if (PSE.getPredicate().isAlwaysTrue()) { - AllowedExit.insert(&I); - continue; - } - reportVectorizationFailure("Value cannot be used outside the loop", - "ValueUsedOutsideLoop", ORE, TheLoop, &I); + } else if (auto *LD = dyn_cast<LoadInst>(&I)) { + if (LD->getMetadata(LLVMContext::MD_nontemporal)) { + // For nontemporal loads, check that a nontemporal vector version is + // supported on the target (arbitrarily try a vector of 2 elements). 
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of load type"); + if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { + reportVectorizationFailure( + "nontemporal load instruction cannot be vectorized", + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } - } // next instr. + } + + // FP instructions can allow unsafe algebra, thus vectorizable by + // non-IEEE-754 compliant SIMD units. + // This applies to floating-point math operations and calls, not memory + // operations, shuffles, or casts, as they don't change precision or + // semantics. + } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && + !I.isFast()) { + LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); + Hints->setPotentiallyUnsafe(); } - if (!PrimaryInduction) { - if (Inductions.empty()) { - reportVectorizationFailure("Did not find one integer induction var", - "loop induction variable could not be identified", - "NoInductionVariable", ORE, TheLoop); - return false; - } - if (!WidestIndTy) { - reportVectorizationFailure("Did not find one integer induction var", - "integer loop induction variable could not be identified", - "NoIntegerInductionVariable", ORE, TheLoop); - return false; + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { + // We can safely vectorize loops where instructions within the loop are + // used outside the loop only if the SCEV predicates within the loop is + // same as outside the loop. Allowing the exit means reusing the SCEV + // outside the loop. + if (PSE.getPredicate().isAlwaysTrue()) { + AllowedExit.insert(&I); + return true; } - LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + reportVectorizationFailure("Value cannot be used outside the loop", + "ValueUsedOutsideLoop", ORE, TheLoop, &I); + return false; } - // Now we know the widest induction type, check if our found induction - // is the same size. If it's not, unset it here and InnerLoopVectorizer - // will create another. 
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) - PrimaryInduction = nullptr; - return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 912c893..838476d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,13 +256,15 @@ public: new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::none(), DL, Name)); } - VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, - DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::inBounds(), DL, Name)); + + VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, + GEPNoWrapFlags GEPFlags, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + return tryInsertInstruction(new VPInstruction( + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); } + VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -276,6 +278,20 @@ public: return tryInsertInstruction(new VPPhi(IncomingValues, DL, Name)); } + VPValue *createElementCount(Type *Ty, ElementCount EC) { + VPlan &Plan = *getInsertBlock()->getPlan(); + VPValue *RuntimeEC = + Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + if (EC.isScalable()) { + VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); + RuntimeEC = EC.getKnownMinValue() == 1 + ? VScale + : createOverflowingOp(Instruction::Mul, + {VScale, RuntimeEC}, {true, false}); + } + return RuntimeEC; + } + /// Convert the input value \p Current to the corresponding value of an /// induction with \p Start and \p Step values, using \p Start + \p Current * /// \p Step. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index be00fd6..70f8840 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -499,19 +499,18 @@ class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan) - : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - AC(AC), ORE(ORE), VF(VecWidth), - MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), - Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI), - RTChecks(RTChecks), Plan(Plan), - VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {} + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC), + VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), + UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM), + BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPBB(cast<VPBasicBlock>( + Plan.getVectorLoopRegion()->getSinglePredecessor())) {} virtual ~InnerLoopVectorizer() = default; @@ -548,9 +547,6 @@ public: protected: friend class LoopVectorizationPlanner; - /// Returns (and creates if needed) the trip count of the widened loop. 
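createElementCount above materializes a runtime element count: a plain constant for fixed element counts, and vscale times the known minimum for scalable ones, skipping the multiply when the minimum is 1. A sketch of that computation in plain C++, with vscale passed in as a stand-in for the runtime value:

#include <cassert>
#include <cstdint>

static uint64_t runtimeElementCount(uint64_t KnownMin, bool Scalable,
                                    uint64_t VScale) {
  if (!Scalable)
    return KnownMin;                        // fixed: just the constant
  return KnownMin == 1 ? VScale             // <vscale x 1>: vscale itself
                       : VScale * KnownMin; // <vscale x N>: vscale * N
}

int main() {
  assert(runtimeElementCount(4, /*Scalable=*/false, /*VScale=*/2) == 4);
  assert(runtimeElementCount(4, /*Scalable=*/true, /*VScale=*/2) == 8);
  assert(runtimeElementCount(1, /*Scalable=*/true, /*VScale=*/2) == 2);
  return 0;
}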
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); - // Create a check to see if the vector loop should be executed Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; @@ -586,18 +582,12 @@ protected: /// Dominator Tree. DominatorTree *DT; - /// Target Library Info. - const TargetLibraryInfo *TLI; - /// Target Transform Info. const TargetTransformInfo *TTI; /// Assumption Cache. AssumptionCache *AC; - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter *ORE; - /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; @@ -619,9 +609,6 @@ protected: /// The scalar-loop preheader. BasicBlock *LoopScalarPreHeader = nullptr; - /// Middle Block between the vector and the scalar. - BasicBlock *LoopMiddleBlock = nullptr; - /// Trip count of the original loop. Value *TripCount = nullptr; @@ -648,7 +635,7 @@ protected: /// The vector preheader block of \p Plan, used as target for check blocks /// introduced during skeleton creation. - VPBlockBase *VectorPHVPB; + VPBasicBlock *VectorPHVPBB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -686,14 +673,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { public: InnerLoopAndEpilogueVectorizer( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM, - BFI, PSI, Checks, Plan), + DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth, + MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, + Checks, Plan), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -721,15 +708,17 @@ public: /// epilogues. 
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerMainLoop( - Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Check, Plan) {} + EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetTransformInfo *TTI, + AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check, VPlan &Plan) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM, + BFI, PSI, Check, Plan, EPI.MainLoopVF, + EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final; @@ -750,13 +739,13 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) - : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, CM, BFI, PSI, Checks, Plan) { + DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, + EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks, VPlan &Plan) + : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM, + BFI, PSI, Checks, Plan, EPI.EpilogueVF, + EPI.EpilogueVF, EPI.EpilogueUF) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the @@ -835,7 +824,14 @@ namespace llvm { Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); - return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); + ElementCount VFxStep = VF.multiplyCoefficientBy(Step); + assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF"); + if (VF.isScalable() && isPowerOf2_64(Step)) { + return B.CreateShl( + B.CreateVScale(Ty), + ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true); + } + return B.CreateElementCount(Ty, VFxStep); } /// Return the runtime value for VF. @@ -2272,65 +2268,15 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { - if (VectorTripCount) - return VectorTripCount; - - Value *TC = getTripCount(); - IRBuilder<> Builder(InsertBlock->getTerminator()); - - Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. 
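The createStepForVF change above emits the scalable step as a shift: when VF * Step is a power of two, vscale * (VF * Step) equals vscale << log2(VF * Step). A standalone check of that identity in plain C++ with hypothetical values:

#include <cassert>
#include <cstdint>

static uint64_t log2u(uint64_t V) { // floor(log2), exact for a power of two
  uint64_t L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

int main() {
  const uint64_t VFxStep = 8; // e.g. a <vscale x 4> VF with Step = 2
  for (uint64_t VScale : {1u, 2u, 16u})
    assert(VScale * VFxStep == (VScale << log2u(VFxStep)));
  return 0;
}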
- Value *Step = createStepForVF(Builder, Ty, VF, UF); - - // If the tail is to be folded by masking, round the number of iterations N - // up to a multiple of Step instead of rounding down. This is done by first - // adding Step-1 and then rounding down. Note that it's ok if this addition - // overflows: the vector induction variable will eventually wrap to zero given - // that it starts at zero and its Step is a power of two; the loop will then - // exit, with the last early-exit vector comparison also producing all-true. - // For scalable vectors the VF is not guaranteed to be a power of 2, but this - // is accounted for in emitIterationCountCheck that adds an overflow check. - if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && - "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), - "n.rnd.up"); - } - - // Now we need to generate the expression for the part of the loop that the - // vectorized body will execute. This is equal to N - (N % Step) if scalar - // iterations are not required for correctness, or N - Step, otherwise. Step - // is equal to the vectorization factor (number of SIMD elements) times the - // unroll factor (number of SIMD instructions). - Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - - // There are cases where we *must* run at least one iteration in the remainder - // loop. See the cost model for when this can happen. If the step evenly - // divides the trip count, we set the remainder to be equal to the step. If - // the step does not evenly divide the trip count, no adjustment is necessary - // since there will already be scalar iterations. Note that the minimum - // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF.isVector())) { - auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); - R = Builder.CreateSelect(IsZero, Step, R); - } - - VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); - - return VectorTripCount; -} - void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. VPBlockBase *ScalarPH = Plan.getScalarPreheader(); - VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); + VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor(); assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor"); VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); - VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB); PreVectorPH = CheckVPIRBB; VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); PreVectorPH->swapSuccessors(); @@ -2359,7 +2305,10 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - IRBuilder<> Builder(TCCheckBlock->getTerminator()); + IRBuilder<InstSimplifyFolder> Builder( + TCCheckBlock->getContext(), + InstSimplifyFolder(TCCheckBlock->getDataLayout())); + Builder.SetInsertPoint(TCCheckBlock->getTerminator()); // If tail is to be folded, vector loop takes care of all iterations. 
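The getOrCreateVectorTripCount logic removed here (the trip count is now materialized in VPlan via materializeVectorTripCount further down) computes, for Step = VF * UF: round the trip count up to a multiple of Step when the tail is folded by masking, otherwise round down, and force a full Step of remainder when a scalar epilogue is required and the count would otherwise divide evenly. A plain C++ sketch of that arithmetic with hypothetical counts:

#include <cassert>
#include <cstdint>

static uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTail,
                                bool RequiresScalarEpilogue) {
  if (FoldTail)
    TC += Step - 1;   // round up instead of down (Step is a power of two)
  uint64_t R = TC % Step;
  if (RequiresScalarEpilogue && R == 0)
    R = Step;         // guarantee at least one scalar iteration
  return TC - R;
}

int main() {
  assert(vectorTripCount(10, 4, /*FoldTail=*/false, /*ScalarEpi=*/false) == 8);
  assert(vectorTripCount(10, 4, /*FoldTail=*/true, /*ScalarEpi=*/false) == 12);
  assert(vectorTripCount(8, 4, /*FoldTail=*/false, /*ScalarEpi=*/true) == 4);
  return 0;
}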
Value *Count = getTripCount(); @@ -2371,7 +2320,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, return createStepForVF(Builder, CountTy, VF, UF); Value *MinProfTC = - createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + Builder.CreateElementCount(CountTy, MinProfitableTripCount); if (!VF.isScalable()) return MinProfTC; return Builder.CreateBinaryIntrinsic( @@ -2437,16 +2386,20 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. -static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { +static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, + BasicBlock *IRBB) { VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); - for (auto &R : make_early_inc_range(*VPBB)) { - assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) && - "Tried to move phi recipe after a non-phi recipe"); + auto IP = IRVPBB->begin(); + for (auto &R : make_early_inc_range(VPBB->phis())) + R.moveBefore(*IRVPBB, IP); + + for (auto &R : + make_early_inc_range(make_range(VPBB->getFirstNonPhi(), VPBB->end()))) R.moveBefore(*IRVPBB, IRVPBB->end()); - } VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); // VPBB is now dead and will be cleaned up when the plan gets destroyed. + return IRVPBB; } void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { @@ -2549,7 +2502,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // to the scalar loop. emitIterationCountCheck(LoopScalarPreHeader); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); + replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); return LoopVectorPreHeader; } @@ -2680,19 +2633,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. fixNonInductionPHIs(State); - // After vectorization, the exit blocks of the original loop will have - // additional predecessors. Invalidate SCEVs for the exit phis in case SE - // looked through single-entry phis. - SmallVector<BasicBlock *> ExitBlocks; - OrigLoop->getExitBlocks(ExitBlocks); - for (BasicBlock *Exit : ExitBlocks) - for (PHINode &PN : Exit->phis()) - PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - PSE.getSE()->forgetBlockAndLoopDispositions(); - // Don't apply optimizations below when no (vector) loop remains, as they all // require one at the moment. VPBasicBlock *HeaderVPBB = @@ -2734,11 +2674,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); // Make sure the builder has a valid insert point. Builder.SetInsertPoint(NewPhi); - for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) { - VPValue *Inc = VPPhi->getIncomingValue(Idx); - const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); + for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks()) NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); - } } } } @@ -3158,6 +3095,12 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (Group->isReverse()) return false; + // TODO: Support interleaved access that requires a gap mask for scalable VFs. 
+ bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking || + StoreAccessWithGapsRequiresMasking; + if (VF.isScalable() && NeedsMaskForGaps) + return false; + auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -4069,8 +4012,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4178,7 +4120,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI) { assert(VF.isVector() && "Checking a scalar VF?"); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); DenseSet<VPRecipeBase *> EphemeralRecipes; collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); // Set of already visited types. @@ -4326,8 +4268,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -5272,8 +5213,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - InstructionCost Cost = - VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost( + PtrTy, SE, PtrSCEV, CostKind); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. @@ -5344,11 +5285,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); + Type *PtrTy = getLoadStorePointerOperand(I)->getType(); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { - return TTI.getAddressComputationCost(ValTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, @@ -5361,13 +5303,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent // the actual generated code, which involves extracting the last element of // a scalable vector where the lane to extract is unknown at compile time. - return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, - CostKind) + - (IsLoopInvariantStoreValue - ? 
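The NeedsMaskForGaps bail-out above concerns interleave groups that do not access every member: the wide access then needs a mask that switches off the gap lanes, which is not yet supported for scalable VFs. A rough sketch of what such a gap mask looks like for a fixed VF, in plain C++ with a hypothetical three-member group where only members 0 and 2 are used:

#include <cassert>
#include <vector>

// One run of Factor lanes per original iteration; a lane is enabled only if
// the corresponding group member is actually accessed.
static std::vector<bool> gapMask(unsigned VF, const std::vector<bool> &HasMember) {
  std::vector<bool> Mask;
  for (unsigned Iter = 0; Iter < VF; ++Iter)
    for (bool Used : HasMember)
      Mask.push_back(Used);
  return Mask;
}

int main() {
  std::vector<bool> M = gapMask(/*VF=*/2, {true, false, true});
  std::vector<bool> Expected = {true, false, true, true, false, true};
  assert(M == Expected);
  return 0;
}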
0 - : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - CostKind, VF.getKnownMinValue() - 1)); + InstructionCost Cost = + TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind); + if (!IsLoopInvariantStoreValue) + Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, + VectorTy, CostKind, 0); + return Cost; } InstructionCost @@ -5377,8 +5319,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); - return TTI.getAddressComputationCost(VectorTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, CostKind, I); @@ -5613,11 +5556,12 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF.isScalar()) { Type *ValTy = getLoadStoreType(I); + Type *PtrTy = getLoadStorePointerOperand(I)->getType(); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); - return TTI.getAddressComputationCost(ValTy) + + return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind, OpInfo, I); } @@ -6976,8 +6920,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7178,8 +7121,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -7317,10 +7259,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( ++LoopsEarlyExitVectorized; // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. 
- VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, - OrigLoop->getHeader()->getContext()); - VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); + VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); + VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()); if (HasBranchWeights) { @@ -7339,21 +7281,25 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); - VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); + VPlanTransforms::simplifyRecipes(BestVPlan); VPlanTransforms::removeBranchOnConst(BestVPlan); VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); - VPlanTransforms::convertToConcreteRecipes(BestVPlan, - *Legal->getWidestInductionType()); + VPlanTransforms::convertToConcreteRecipes(BestVPlan); // Regions are dissolved after optimizing for VF and UF, which completely // removes unneeded loop regions first. VPlanTransforms::dissolveLoopRegions(BestVPlan); // Canonicalize EVL loops after regions are dissolved. VPlanTransforms::canonicalizeEVLLoops(BestVPlan); VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH); + VPlanTransforms::materializeVectorTripCount( + BestVPlan, VectorPH, CM.foldTailByMasking(), + CM.requiresScalarEpilogue(BestVF.isVector())); + VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF); + VPlanTransforms::simplifyRecipes(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7393,12 +7339,28 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BasicBlock *EntryBB = cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock(); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - if (VectorizingEpilogue) - VPlanTransforms::removeDeadRecipes(BestVPlan); + replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(), + State.CFG.PrevBB->getSingleSuccessor()); + VPlanTransforms::removeDeadRecipes(BestVPlan); assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) && "final VPlan is invalid"); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + ScalarEvolution &SE = *PSE.getSE(); + for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { + if (Exit->getNumPredecessors() == 0) + continue; + for (VPRecipeBase &PhiR : Exit->phis()) + SE.forgetLcssaPhiWithNewPredecessor( + OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + } + // Forget the original loop and block dispositions. + SE.forgetLoop(OrigLoop); + SE.forgetBlockAndLoopDispositions(); + ILV.printDebugTracesAtStart(); //===------------------------------------------------===// @@ -7409,11 +7371,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // //===------------------------------------------------===// - // 2. Copy and widen instructions from the old loop into the new loop. 
- BestVPlan.prepareToExecute( - ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); - replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); - // Move check blocks to their final position. // TODO: Move as part of VPIRBB execute and update impacted tests. if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second) @@ -7530,7 +7487,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, false); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -7557,8 +7513,9 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = createIterationCountCheck( - ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF); + Value *CheckMinIters = + createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); BasicBlock *const TCCheckBlock = LoopVectorPreHeader; if (!ForEpilogue) @@ -7568,12 +7525,13 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), static_cast<DominatorTree *>(nullptr), LI, nullptr, "vector.ph"); - if (ForEpilogue) { // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. EPI.TripCount = Count; + } else { + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); } BranchInst &BI = @@ -7607,6 +7565,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { BasicBlock *VecEpilogueIterationCountCheck = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, nullptr, "vec.epilog.iter.check", true); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); AdditionalBypassBlock = VecEpilogueIterationCountCheck; @@ -7661,7 +7621,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { Phi->removeIncomingValue(MemCheckBlock); } - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -7690,11 +7649,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't - // think the MainLoopStep is correct. - unsigned MainLoopStep = UF * VF.getKnownMinValue(); + auto VScale = Cost->getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); unsigned EpilogueLoopStep = - EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); // We assume the remaining `Count` is equally distributed in // [0, MainLoopStep) // So the probability for `Count < EpilogueLoopStep` should be @@ -8159,7 +8118,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { // extends are intended to be lowered along with the reduction itself. 
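The branch-weight update above assumes the remaining trip count at the epilogue check is uniformly distributed in [0, MainLoopStep), so the probability that it falls below EpilogueLoopStep is EpilogueLoopStep / MainLoopStep, with both steps now estimated through vscale-for-tuning rather than raw known-minimum values. A sketch of the estimate in plain C++ with hypothetical VFs, UFs and vscale:

#include <cassert>

static unsigned estimateElements(unsigned KnownMin, bool Scalable, unsigned VScale) {
  return Scalable ? KnownMin * VScale : KnownMin;
}

int main() {
  // Main loop <vscale x 4> with UF 2, epilogue fixed 4 with UF 1, vscale ~ 2.
  unsigned MainLoopStep = estimateElements(4 * 2, /*Scalable=*/true, /*VScale=*/2);
  unsigned EpilogueLoopStep = estimateElements(4 * 1, /*Scalable=*/false, /*VScale=*/2);
  assert(MainLoopStep == 16 && EpilogueLoopStep == 4);
  // Uniform remainder in [0, MainLoopStep) => P(Count < EpilogueLoopStep) = 4/16.
  double P = double(EpilogueLoopStep) / MainLoopStep;
  assert(P == 0.25);
  return 0;
}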
// Build up a set of partial reduction ops for efficient use checking. - SmallSet<User *, 4> PartialReductionOps; + SmallPtrSet<User *, 4> PartialReductionOps; for (const auto &[PartialRdx, _] : PartialReductionChains) PartialReductionOps.insert(PartialRdx.ExtendUser); @@ -8435,8 +8394,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, LVer.prepareNoAliasMetadata(); } + // Create initial base VPlan0, to serve as common starting point for all + // candidates built later for specific VF ranges. + auto VPlan0 = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + auto MaxVFTimes2 = MaxVF * 2; - auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes( @@ -8500,7 +8464,7 @@ static VPInstruction *addResumePhiRecipeForInduction( /// \p IVEndValues. static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) { - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]); VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); @@ -8675,23 +8639,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, having a basic block for the pre-header - // which contains SCEV expansions that need to happen before the CFG is - // modified; a basic block for the vector pre-header, followed by a region for - // the vector loop, followed by the middle basic block. The skeleton vector - // loop region contains a header and latch basic blocks. - bool RequiresScalarEpilogueCheck = LoopVectorizationPlanner::getDecisionAndClampRange( [this](ElementCount VF) { return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, - CM.foldTailByMasking(), OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), - Legal->hasUncountableEarlyExit(), Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), + Range); + VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, + CM.foldTailByMasking()); + VPlanTransforms::createLoopRegions(*Plan); VPlanTransforms::createExtractsForLiveOuts(*Plan); @@ -8889,8 +8847,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8977,11 +8934,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false, - Range); + auto Plan = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + VPlanTransforms::handleEarlyExits(*Plan, + /*HasUncountableExit*/ false, Range); + VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, + /*TailFolded*/ false); + VPlanTransforms::createLoopRegions(*Plan); for (ElementCount VF : Range) @@ -9114,6 +9074,16 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( CurrentLinkI->getFastMathFlags()); LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); VecOp = FMulRecipe; + } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && + CurrentLinkI->getOpcode() == Instruction::Sub) { + Type *PhiTy = PhiR->getUnderlyingValue()->getType(); + auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + VPWidenRecipe *Sub = new VPWidenRecipe( + Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, + VPIRMetadata(), CurrentLinkI->getDebugLoc()); + Sub->setUnderlyingValue(CurrentLinkI); + LinkVPBB->insert(Sub, CurrentLink->getIterator()); + VecOp = Sub; } else { if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { if (isa<VPWidenRecipe>(CurrentLink)) { @@ -9407,13 +9377,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - // If index is the vector trip count, the concrete value will only be set in - // prepareToExecute, leading to missed simplifications, e.g. if it is 0. - // TODO: Remove the special case for the vector trip count once it is computed - // in VPlan and can be used during VPlan simplification. - assert((DerivedIV != Index || - getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && - "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } @@ -9515,8 +9478,8 @@ static bool processLoopInVPlanNativePath( { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan); + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM, + BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -9798,6 +9761,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { else if (&*MainScalarPH->begin() != ResumePhi) ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin()); } + // Add a user to to make sure the resume phi won't get removed. 
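// Illustrative sketch (not from the patch): the AddChainWithSubs handling above
// folds an in-loop "acc - x" link into the add chain by widening it as
// "acc + (0 - x)". A standalone scalar illustration of that identity; names are
// illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

inline int64_t reduceAddChainWithSubs(const std::vector<int64_t> &Vals,
                                      const std::vector<bool> &IsSub) {
  int64_t Acc = 0;
  for (size_t I = 0; I < Vals.size(); ++I) {
    // A subtract link is expressed as an add of the negated operand.
    int64_t Link = IsSub[I] ? int64_t(0) - Vals[I] : Vals[I];
    Acc += Link;
  }
  return Acc;
}

inline void reduceAddChainWithSubsExample() {
  // ((0 + 5) - 3) + 2 == 0 + 5 + (0 - 3) + 2 == 4
  assert(reduceAddChainWithSubs({5, 3, 2}, {false, true, false}) == 4);
}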
+ VPBuilder(MainScalarPH) + .createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi); } /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded @@ -10171,8 +10137,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; - VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), - CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, + CM.CostKind); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, @@ -10223,8 +10189,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. - LLVM_DEBUG( - dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); + LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly " + "disabled.\n"); IntDiagMsg = {"InterleavingBeneficialButDisabled", "the cost-model indicates that interleaving is beneficial " "but is explicitly disabled or interleave count is set to 1"}; @@ -10295,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. VPlan &BestPlan = LVP.getPlanFor(VF.Width); InnerLoopVectorizer Unroller( - L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), + L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1), ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); // TODO: Move to general VPlan pipeline once epilogue loops are also @@ -10330,20 +10296,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, BestEpiPlan); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, &CM, BFI, PSI, Checks, - *BestMainPlan); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, + BFI, PSI, Checks, *BestMainPlan); auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, false); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &CM, BFI, PSI, - Checks, BestEpiPlan); + EpilogueVectorizerEpilogueLoop EpilogILV( + L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); EpilogILV.setTripCount(MainILV.getTripCount()); preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); @@ -10368,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.MinProfitableTripCount, IC, &CM, BFI, PSI, Checks, BestPlan); // TODO: Move to general VPlan pipeline once epilogue loops are also diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 39011e7..37dc414 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -525,17 +525,17 @@ static bool isSplat(ArrayRef<Value *> VL) { /// instructions, we need to use the converted opcode along with the original /// uses. 
/// \param I The instruction to check for commutativity -/// \param InstWithUses The instruction whose uses are analyzed for special +/// \param ValWithUses The value whose uses are analyzed for special /// patterns -static bool isCommutative(Instruction *I, Instruction *InstWithUses) { +static bool isCommutative(Instruction *I, Value *ValWithUses) { if (auto *Cmp = dyn_cast<CmpInst>(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast<BinaryOperator>(I)) return BO->isCommutative() || (BO->getOpcode() == Instruction::Sub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && + !ValWithUses->hasNUsesOrMore(UsesLimit) && all_of( - InstWithUses->uses(), + ValWithUses->uses(), [](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; @@ -552,8 +552,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && - all_of(InstWithUses->uses(), [](const Use &U) { + !ValWithUses->hasNUsesOrMore(UsesLimit) && + all_of(ValWithUses->uses(), [](const Use &U) { return match(U.getUser(), m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get()))); })); @@ -570,6 +570,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { /// \returns true if the instruction is commutative, false otherwise static bool isCommutative(Instruction *I) { return isCommutative(I, I); } +/// \returns number of operands of \p I, considering commutativity. Returns 2 +/// for commutative instrinsics. +/// \param I The instruction to check for commutativity +static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) { + if (isa<IntrinsicInst>(I) && isCommutative(I)) { + // IntrinsicInst::isCommutative returns true if swapping the first "two" + // arguments to the intrinsic produces the same result. + constexpr unsigned IntrinsicNumOperands = 2; + return IntrinsicNumOperands; + } + return I->getNumOperands(); +} + template <typename T> static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, unsigned Offset) { @@ -862,6 +875,16 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) { } namespace llvm { +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V); +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from the different blocks. +static bool isUsedOutsideBlock(Value *V); /// Checks if the specified value does not require scheduling. It does not /// require scheduling if all operands and all users do not need to be scheduled /// in the current basic block. @@ -1307,6 +1330,7 @@ public: : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + /// Checks if the value is a copyable element. bool isCopyableElement(Value *V) const { assert(valid() && "InstructionsState is invalid."); if (!HasCopyables) @@ -1338,6 +1362,8 @@ public: doesNotNeedToBeScheduled(V); // MainOp for copyables always schedulable to correctly identify // non-schedulable copyables. 
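// Illustrative sketch (not from the patch): why the use-driven commutativity
// checks above are sound. A "sub" may be treated as commutative when every use
// only compares the result against zero (a - b == 0 iff b - a == 0), and an
// "fsub" when every use immediately feeds fabs (|a - b| == |b - a|). Standalone
// illustration; names are illustrative only.
#include <cassert>
#include <cmath>
#include <cstdint>

inline void subCommutativityUnderUsesExample() {
  int64_t A = 7, B = 11;
  // An eq/ne-with-zero user cannot observe the operand order.
  assert(((A - B) == 0) == ((B - A) == 0));
  // An fabs user cannot observe the operand order either.
  double X = 2.5, Y = 9.25;
  assert(std::fabs(X - Y) == std::fabs(Y - X));
}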
+ if (getMainOp() == V) + return false; if (isCopyableElement(V)) { auto IsNonSchedulableCopyableElement = [this](Value *V) { auto *I = dyn_cast<Instruction>(V); @@ -1355,6 +1381,7 @@ public: doesNotNeedToBeScheduled(V); } + /// Checks if the state represents copyable instructions. bool areInstructionsWithCopyableElements() const { assert(valid() && "InstructionsState is invalid."); return HasCopyables; @@ -1886,6 +1913,7 @@ class BoUpSLP { class TreeEntry; class ScheduleEntity; class ScheduleData; + class ScheduleCopyableData; class ScheduleBundle; class ShuffleCostEstimator; class ShuffleInstructionBuilder; @@ -2246,6 +2274,7 @@ public: operator bool() const { return UserTE != nullptr; } }; + friend struct DenseMapInfo<EdgeInfo>; /// A helper class used for scoring candidates for two consecutive lanes. class LookAheadHeuristics { @@ -2384,6 +2413,11 @@ public: if (C1 && C2) return LookAheadHeuristics::ScoreConstants; + // Consider constants and buildvector compatible. + if ((C1 && isa<InsertElementInst>(V2)) || + (C2 && isa<InsertElementInst>(V1))) + return LookAheadHeuristics::ScoreConstants; + // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. Value *EV1; @@ -3010,10 +3044,9 @@ public: assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. - constexpr unsigned IntrinsicNumOperands = 2; Instruction *MainOp = S.getMainOp(); unsigned NumOperands = MainOp->getNumOperands(); - ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands; + ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp); OpsVec.resize(ArgSize); unsigned NumLanes = VL.size(); for (OperandDataVec &Ops : OpsVec) @@ -3038,7 +3071,7 @@ public: bool IsInverseOperation = false; if (S.isCopyableElement(VL[Lane])) { // The value is a copyable element. - IsInverseOperation = !isCommutative(MainOp); + IsInverseOperation = !isCommutative(MainOp, VL[Lane]); } else { assert(I && "Expected instruction"); auto [SelectedOp, Ops] = convertTo(I, S); @@ -4332,7 +4365,10 @@ private: } else { // Build a map for gathered scalars to the nodes where they are used. bool AllConstsOrCasts = true; - for (Value *V : VL) + for (Value *V : VL) { + if (S && S.areInstructionsWithCopyableElements() && + S.isCopyableElement(V)) + Last->addCopyableElement(V); if (!isConstant(V)) { auto *I = dyn_cast<CastInst>(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); @@ -4340,6 +4376,7 @@ private: !UserTreeIdx.UserTE->isGather()) ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } + } if (AllConstsOrCasts) CastMaxMinBWSizes = std::make_pair(std::numeric_limits<unsigned>::max(), 1); @@ -4518,8 +4555,6 @@ private: bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction."); - if (!isSimple(Inst2)) - return true; // First check if the result is already in the cache. AliasCacheKey Key = std::make_pair(Inst1, Inst2); auto Res = AliasCache.try_emplace(Key); @@ -4528,7 +4563,6 @@ private: bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. Res.first->getSecond() = Aliased; - AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); return Aliased; } @@ -4587,16 +4621,18 @@ private: /// List of hashes of vector of loads, which are known to be non vectorizable. 
DenseSet<size_t> ListOfKnonwnNonVectorizableLoads; - /// Represents a scheduling entity, either ScheduleData or ScheduleBundle. - /// ScheduleData used to gather dependecies for a single instructions, while - /// ScheduleBundle represents a batch of instructions, going to be groupped - /// together. + /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData + /// or ScheduleBundle. ScheduleData used to gather dependecies for a single + /// instructions, while ScheduleBundle represents a batch of instructions, + /// going to be groupped together. ScheduleCopyableData models extra user for + /// "copyable" instructions. class ScheduleEntity { friend class ScheduleBundle; friend class ScheduleData; + friend class ScheduleCopyableData; protected: - enum class Kind { ScheduleData, ScheduleBundle }; + enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData }; Kind getKind() const { return K; } ScheduleEntity(Kind K) : K(K) {} @@ -4615,17 +4651,79 @@ private: void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; } int getSchedulingPriority() const { return SchedulingPriority; } bool isReady() const { - if (auto *SD = dyn_cast<ScheduleData>(this)) + if (const auto *SD = dyn_cast<ScheduleData>(this)) return SD->isReady(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->isReady(); return cast<ScheduleBundle>(this)->isReady(); } + /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->hasValidDependencies(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->hasValidDependencies(); + return cast<ScheduleBundle>(this)->hasValidDependencies(); + } + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getUnscheduledDeps(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->getUnscheduledDeps(); + return cast<ScheduleBundle>(this)->unscheduledDepsInBundle(); + } + /// Increments the number of unscheduled dependencies. + int incrementUnscheduledDeps(int Incr) { + if (auto *SD = dyn_cast<ScheduleData>(this)) + return SD->incrementUnscheduledDeps(Incr); + return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr); + } + /// Gets the number of dependencies. + int getDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getDependencies(); + return cast<ScheduleCopyableData>(this)->getDependencies(); + } + /// Gets the instruction. + Instruction *getInst() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getInst(); + return cast<ScheduleCopyableData>(this)->getInst(); + } + /// Gets/sets if the bundle is scheduled. 
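// Illustrative sketch (not from the patch): a minimal, self-contained version of
// the kind-tagged dispatch that the extended ScheduleEntity uses above. The real
// class goes through LLVM's isa/dyn_cast machinery; this sketch dispatches on a
// plain enum instead, and all names are illustrative only.
#include <cassert>

class EntityBase {
public:
  enum class Kind { Data, Bundle, CopyableData };
  explicit EntityBase(Kind K) : K(K) {}
  Kind getKind() const { return K; }

  // Common accessor that forwards to the concrete entity, mirroring
  // ScheduleEntity::getUnscheduledDeps().
  int getUnscheduledDeps() const;

private:
  Kind K;
};

class DataEntity : public EntityBase {
public:
  explicit DataEntity(int Deps)
      : EntityBase(Kind::Data), UnscheduledDeps(Deps) {}
  int UnscheduledDeps;
};

class CopyableEntity : public EntityBase {
public:
  explicit CopyableEntity(int Deps)
      : EntityBase(Kind::CopyableData), UnscheduledDeps(Deps) {}
  int UnscheduledDeps;
};

inline int EntityBase::getUnscheduledDeps() const {
  switch (getKind()) {
  case Kind::Data:
    return static_cast<const DataEntity *>(this)->UnscheduledDeps;
  case Kind::CopyableData:
    return static_cast<const CopyableEntity *>(this)->UnscheduledDeps;
  case Kind::Bundle:
    return 0; // A bundle would sum over its members here.
  }
  return 0;
}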
bool isScheduled() const { return IsScheduled; } void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } static bool classof(const ScheduleEntity *) { return true; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->dump(OS); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->dump(OS); + return cast<ScheduleBundle>(this)->dump(OS); + } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::ScheduleEntity &SE) { + SE.dump(OS); + return OS; + } +#endif + /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a @@ -4688,10 +4786,18 @@ private: /// Clears all dependency information. void clearDependencies() { - Dependencies = InvalidDeps; - resetUnscheduledDeps(); + clearDirectDependencies(); MemoryDependencies.clear(); ControlDependencies.clear(); + } + + /// Clears all direct dependencies only, except for control and memory + /// dependencies. + /// Required for copyable elements to correctly handle control/memory deps + /// and avoid extra reclaculation of such deps. + void clearDirectDependencies() { + Dependencies = InvalidDeps; + resetUnscheduledDeps(); IsScheduled = false; } @@ -4781,7 +4887,7 @@ private: class ScheduleBundle final : public ScheduleEntity { /// The schedule data for the instructions in the bundle. - SmallVector<ScheduleData *> Bundle; + SmallVector<ScheduleEntity *> Bundle; /// True if this bundle is valid. bool IsValid = true; /// The TreeEntry that this instruction corresponds to. @@ -4797,7 +4903,7 @@ private: /// Verify basic self consistency properties void verify() const { - for (const ScheduleData *SD : Bundle) { + for (const ScheduleEntity *SD : Bundle) { if (SD->hasValidDependencies()) { assert(SD->getUnscheduledDeps() <= SD->getDependencies() && "invariant"); @@ -4817,7 +4923,7 @@ private: int unscheduledDepsInBundle() const { assert(*this && "bundle must not be empty"); int Sum = 0; - for (const ScheduleData *BundleMember : Bundle) { + for (const ScheduleEntity *BundleMember : Bundle) { if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps) return ScheduleData::InvalidDeps; Sum += BundleMember->getUnscheduledDeps(); @@ -4829,7 +4935,7 @@ private: /// Note that depenendency validity can vary between instructions within /// a single bundle. bool hasValidDependencies() const { - return all_of(Bundle, [](const ScheduleData *SD) { + return all_of(Bundle, [](const ScheduleEntity *SD) { return SD->hasValidDependencies(); }); } @@ -4843,10 +4949,10 @@ private: /// Returns the bundle of scheduling data, associated with the current /// instruction. - ArrayRef<ScheduleData *> getBundle() { return Bundle; } - ArrayRef<const ScheduleData *> getBundle() const { return Bundle; } + ArrayRef<ScheduleEntity *> getBundle() { return Bundle; } + ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; } /// Adds an instruction to the bundle. - void add(ScheduleData *SD) { Bundle.push_back(SD); } + void add(ScheduleEntity *SD) { Bundle.push_back(SD); } /// Gets/sets the associated tree entry. 
void setTreeEntry(TreeEntry *TE) { this->TE = TE; } @@ -4863,8 +4969,11 @@ private: return; } OS << '['; - interleaveComma(Bundle, OS, - [&](const ScheduleData *SD) { OS << *SD->getInst(); }); + interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) { + if (isa<ScheduleCopyableData>(SD)) + OS << "<Copyable>"; + OS << *SD->getInst(); + }); OS << ']'; } @@ -4883,6 +4992,129 @@ private: } #endif + /// Contains all scheduling relevant data for the copyable instruction. + /// It models the virtual instructions, supposed to replace the original + /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0, + /// %1], where %1 = add, then the ScheduleCopyableData models virtual + /// instruction %virt = add %0, 0. + class ScheduleCopyableData final : public ScheduleEntity { + /// The source schedule data for the instruction. + Instruction *Inst = nullptr; + /// The edge information for the instruction. + const EdgeInfo EI; + /// This ScheduleData is in the current scheduling region if this matches + /// the current SchedulingRegionID of BlockScheduling. + int SchedulingRegionID = 0; + /// Bundle, this data is part of. + ScheduleBundle &Bundle; + + public: + ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I, + const EdgeInfo &EI, ScheduleBundle &Bundle) + : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI), + SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {} + static bool classof(const ScheduleEntity *Entity) { + return Entity->getKind() == Kind::ScheduleCopyableData; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(hasValidDependencies() && UnscheduledDeps == 0 && + "unexpected scheduled state"); + } + } + + /// Returns true if the dependency information has been calculated. + /// Note that depenendency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + return Dependencies != ScheduleData::InvalidDeps; + } + + /// Returns true if it is ready for scheduling, i.e. it has no more + /// unscheduled depending instructions/bundles. + bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; } + + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. + int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); + UnscheduledDeps += Incr; + assert(UnscheduledDeps >= 0 && "invariant"); + return UnscheduledDeps; + } + + /// Sets the number of unscheduled dependencies to the number of + /// dependencies. + void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } + + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { return UnscheduledDeps; } + /// Gets the number of dependencies. + int getDependencies() const { return Dependencies; } + /// Initializes the number of dependencies. + void initDependencies() { Dependencies = 0; } + /// Increments the number of dependencies. + void incDependencies() { Dependencies++; } + + /// Gets scheduling region ID. + int getSchedulingRegionID() const { return SchedulingRegionID; } + + /// Gets the instruction. + Instruction *getInst() const { return Inst; } + + /// Clears all dependency information. 
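// Illustrative sketch (not from the patch): the dependency bookkeeping that
// ScheduleCopyableData implements above, in a self-contained form. Counters
// start at an "invalid" sentinel, are filled in during dependency calculation,
// and the node becomes ready once its unscheduled-dependency count reaches
// zero. All names are illustrative only.
#include <cassert>

struct DepCounter {
  static constexpr int InvalidDeps = -1;
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;
  bool IsScheduled = false;

  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  void initDependencies() { Dependencies = 0; }
  void incDependencies() { ++Dependencies; }
  void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
  int incrementUnscheduledDeps(int Incr) {
    assert(hasValidDependencies() && "deps not calculated yet");
    UnscheduledDeps += Incr;
    assert(UnscheduledDeps >= 0 && "invariant");
    return UnscheduledDeps;
  }
  bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
};

inline void depCounterExample() {
  DepCounter D;
  D.initDependencies();
  D.incDependencies();      // one def-use dependency discovered
  D.resetUnscheduledDeps(); // UnscheduledDeps == 1
  assert(!D.isReady());
  D.incrementUnscheduledDeps(-1); // that dependency got scheduled
  assert(D.isReady());
}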
+ void clearDependencies() { + Dependencies = ScheduleData::InvalidDeps; + UnscheduledDeps = ScheduleData::InvalidDeps; + IsScheduled = false; + } + + /// Gets the edge information. + const EdgeInfo &getEdgeInfo() const { return EI; } + + /// Gets the bundle. + ScheduleBundle &getBundle() { return Bundle; } + const ScheduleBundle &getBundle() const { return Bundle; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + private: + /// true, if it has valid dependency information. These nodes always have + /// only single dependency. + int Dependencies = ScheduleData::InvalidDeps; + + /// The number of dependencies minus the number of dependencies of scheduled + /// instructions. As soon as this is zero, the instruction/bundle gets ready + /// for scheduling. + /// Note that this is negative as long as Dependencies is not calculated. + int UnscheduledDeps = ScheduleData::InvalidDeps; + }; + +#ifndef NDEBUG + friend inline raw_ostream & + operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) { + SD.dump(OS); + return OS; + } +#endif + friend struct GraphTraits<BoUpSLP *>; friend struct DOTGraphTraits<BoUpSLP *>; @@ -4909,6 +5141,10 @@ private: void clear() { ScheduledBundles.clear(); ScheduledBundlesList.clear(); + ScheduleCopyableDataMap.clear(); + ScheduleCopyableDataMapByInst.clear(); + ScheduleCopyableDataMapByInstUser.clear(); + ScheduleCopyableDataMapByUsers.clear(); ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; @@ -4935,7 +5171,7 @@ private: // Avoid lookup if can't possibly be in map. return nullptr; ScheduleData *SD = ScheduleDataMap.lookup(I); - if (SD && isInSchedulingRegion(SD)) + if (SD && isInSchedulingRegion(*SD)) return SD; return nullptr; } @@ -4944,6 +5180,201 @@ private: return getScheduleData(dyn_cast<Instruction>(V)); } + /// Returns the ScheduleCopyableData for the given edge (user tree entry and + /// operand number) and value. + ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI, + const Value *V) const { + if (ScheduleCopyableDataMap.empty()) + return nullptr; + auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V)); + if (It == ScheduleCopyableDataMap.end()) + return nullptr; + ScheduleCopyableData *SD = It->getSecond().get(); + if (!isInSchedulingRegion(*SD)) + return nullptr; + return SD; + } + + /// Returns the ScheduleCopyableData for the given user \p User, operand + /// number and operand \p V. + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Value *User, unsigned OperandIdx, + const Value *V) { + if (ScheduleCopyableDataMapByInstUser.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInstUser.find( + std::make_pair(std::make_pair(User, OperandIdx), V)); + if (It == ScheduleCopyableDataMapByInstUser.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + /// Returns true if all operands of the given instruction \p User are + /// replaced by copyable data. + /// \param User The user instruction. + /// \param Op The operand, which might be replaced by the copyable data. + /// \param SLP The SLP tree. + /// \param NumOps The number of operands used. 
If the instruction uses the + /// same operand several times, check for the first use, then the second, + /// etc. + bool areAllOperandsReplacedByCopyableData(Instruction *User, + Instruction *Op, BoUpSLP &SLP, + unsigned NumOps) const { + assert(NumOps > 0 && "No operands"); + if (ScheduleCopyableDataMap.empty()) + return false; + SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount; + SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount; + for (const Use &U : User->operands()) { + if (U.get() != Op) + continue; + ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User); + if (Entries.empty()) + return false; + // Check all tree entries, if they have operands replaced by copyable + // data. + for (TreeEntry *TE : Entries) { + // Check if the user is commutative. + // The commutatives are handled later, as their oeprands can be + // reordered. + // Same applies even for non-commutative cmps, because we can invert + // their predicate potentially and, thus, reorder the operands. + bool IsCommutativeUser = + ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); + EdgeInfo EI(TE, U.getOperandNo()); + if (!IsCommutativeUser && !isa<CmpInst>(User)) { + unsigned &OpCnt = + OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + return false; + // Found copyable operand - continue. + ++OpCnt; + continue; + } + ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0) + .first->getSecond(); + } + } + // Check the commutative/cmp entries. + if (!PotentiallyReorderedEntriesCount.empty()) { + for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && + "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector<unsigned> OpIndices; + for (unsigned OpIdx : + seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps - 1; + }); + } + return true; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Instruction *I) const { + if (ScheduleCopyableDataMapByInst.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInst.find(I); + if (It == ScheduleCopyableDataMapByInst.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableDataUsers(const Instruction *User) const { + if (ScheduleCopyableDataMapByUsers.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByUsers.find(User); + if (It == ScheduleCopyableDataMapByUsers.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI, + Instruction *I, + int SchedulingRegionID, + ScheduleBundle 
&Bundle) { + assert(!getScheduleCopyableData(EI, I) && "already in the map"); + ScheduleCopyableData *CD = + ScheduleCopyableDataMap + .try_emplace(std::make_pair(EI, I), + std::make_unique<ScheduleCopyableData>( + SchedulingRegionID, I, EI, Bundle)) + .first->getSecond() + .get(); + ScheduleCopyableDataMapByInst[I].push_back(CD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (!Visited.insert(In).second) { + It = find(make_range(std::next(It), Op.end()), I); + continue; + } + ScheduleCopyableDataMapByInstUser + .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I)) + .first->getSecond() + .push_back(CD); + ScheduleCopyableDataMapByUsers.try_emplace(I) + .first->getSecond() + .insert(CD); + // Remove extra deps for users, becoming non-immediate users of the + // instruction. It may happen, if the chain of same copyable elements + // appears in the tree. + if (In == I) { + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = + getScheduleCopyableData(UserEI, In)) + ScheduleCopyableDataMapByUsers[I].remove(UserCD); + } + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + } else { + ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert( + CD); + } + return *CD; + } + ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const { auto *I = dyn_cast<Instruction>(V); if (!I) @@ -4954,34 +5385,44 @@ private: return It->getSecond(); } - bool isInSchedulingRegion(ScheduleData *SD) const { - return SD->getSchedulingRegionID() == SchedulingRegionID; - } - - bool isInSchedulingRegion(const ScheduleBundle &Bundle) const { - return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) { - return BundleMember->getSchedulingRegionID() == SchedulingRegionID; - }); + /// Returns true if the entity is in the scheduling region. + bool isInSchedulingRegion(const ScheduleEntity &SD) const { + if (const auto *Data = dyn_cast<ScheduleData>(&SD)) + return Data->getSchedulingRegionID() == SchedulingRegionID; + if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD)) + return CD->getSchedulingRegionID() == SchedulingRegionID; + return all_of(cast<ScheduleBundle>(SD).getBundle(), + [&](const ScheduleEntity *BundleMember) { + return isInSchedulingRegion(*BundleMember); + }); } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template <typename ReadyListType> - void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) { - auto ProcessBundleMember = [&](ScheduleData *BundleMember, - ScheduleBundle *Bundle) { + void schedule(const BoUpSLP &R, const InstructionsState &S, + const EdgeInfo &EI, ScheduleEntity *Data, + ReadyListType &ReadyList) { + auto ProcessBundleMember = [&](ScheduleEntity *BundleMember, + ArrayRef<ScheduleBundle *> Bundles) { // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. 
- auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) { + auto DecrUnsched = [&](auto *Data, bool IsControl = false) { if ((IsControl || Data->hasValidDependencies()) && Data->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. - if (ArrayRef<ScheduleBundle *> Bundles = - getScheduleBundles(Data->getInst()); - !Bundles.empty()) { + SmallVector<ScheduleBundle *, 1> CopyableBundle; + ArrayRef<ScheduleBundle *> Bundles; + if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(Data->getInst()); + } + if (!Bundles.empty()) { for (ScheduleBundle *Bundle : Bundles) { if (Bundle->unscheduledDepsInBundle() == 0) { assert(!Bundle->isScheduled() && @@ -4995,12 +5436,23 @@ private: } assert(!Data->isScheduled() && "already scheduled bundle gets ready"); + assert(!isa<ScheduleCopyableData>(Data) && + "Expected non-copyable data"); ReadyList.insert(Data); LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n"); } }; - auto DecrUnschedForInst = [&](Instruction *I) { + auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx, + Instruction *I) { + if (!ScheduleCopyableDataMap.empty()) { + SmallVector<ScheduleCopyableData *> CopyableData = + getScheduleCopyableData(User, OpIdx, I); + for (ScheduleCopyableData *CD : CopyableData) + DecrUnsched(CD, /*IsControl=*/false); + if (!CopyableData.empty()) + return; + } if (ScheduleData *OpSD = getScheduleData(I)) DecrUnsched(OpSD, /*IsControl=*/false); }; @@ -5008,45 +5460,101 @@ private: // If BundleMember is a vector bundle, its operands may have been // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. - if (Bundle) { - // Need to search for the lane since the tree entry can be reordered. + if (!Bundles.empty()) { auto *In = BundleMember->getInst(); - int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - - // Since vectorization tree is being built recursively this assertion - // ensures that the tree entry has all operands set before reaching - // this code. Couple of exceptions known at the moment are extracts - // where their second (immediate) operand is not added. Since - // immediates do not affect scheduler behavior this is considered - // okay. - assert(In && - (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands()) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast<Instruction>( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() - << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + // Count uses of each instruction operand. + SmallDenseMap<const Instruction *, unsigned> OperandsUses; + unsigned TotalOpCount = 0; + if (isa<ScheduleCopyableData>(BundleMember)) { + // Copyable data is used only once (uses itself). + TotalOpCount = OperandsUses[In] = 1; + } else { + for (const Use &U : In->operands()) { + if (auto *I = dyn_cast<Instruction>(U.get())) { + auto Res = OperandsUses.try_emplace(I, 0); + ++Res.first->getSecond(); + ++TotalOpCount; + } + } + } + // Decrement the unscheduled counter and insert to ready list if + // ready. 
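// Illustrative sketch (not from the patch): a bundle only becomes ready once
// every member is ready, which the code above checks through
// unscheduledDepsInBundle() == 0 after decrementing a member. Minimal
// standalone form of that aggregation; names are illustrative only.
#include <cassert>
#include <vector>

struct MemberDeps {
  static constexpr int InvalidDeps = -1;
  int UnscheduledDeps = InvalidDeps;
};

inline int unscheduledDepsInBundle(const std::vector<MemberDeps> &Bundle) {
  int Sum = 0;
  for (const MemberDeps &M : Bundle) {
    // If any member has no calculated dependencies yet, the bundle as a whole
    // is not ready.
    if (M.UnscheduledDeps == MemberDeps::InvalidDeps)
      return MemberDeps::InvalidDeps;
    Sum += M.UnscheduledDeps;
  }
  return Sum;
}

inline void bundleReadinessExample() {
  std::vector<MemberDeps> Bundle = {{1}, {0}};
  assert(unscheduledDepsInBundle(Bundle) == 1 && "one dependency outstanding");
  Bundle[0].UnscheduledDeps = 0;
  assert(unscheduledDepsInBundle(Bundle) == 0 && "bundle is ready now");
}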
+ auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, + unsigned OpIdx) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) + DecrUnsched(OpSD, /*IsControl=*/false); } + }; + + for (ScheduleBundle *Bundle : Bundles) { + if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) + break; + // Need to search for the lane since the tree entry can be + // reordered. + int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), + find(Bundle->getTreeEntry()->Scalars, In)); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast<int>( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set before + // reaching this code. Couple of exceptions known at the moment are + // extracts where their second (immediate) operand is not added. + // Since immediates do not affect scheduler behavior this is + // considered okay. + assert(In && + (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast<Instruction>( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I + << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); + } + } } else { // If BundleMember is a stand-alone instruction, no operand reordering // has taken place, so we directly access its operands. - for (Use &U : BundleMember->getInst()->operands()) + for (Use &U : BundleMember->getInst()->operands()) { if (auto *I = dyn_cast<Instruction>(U.get())) { LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I); } + } } // Handle the memory dependencies. - for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) { + auto *SD = dyn_cast<ScheduleData>(BundleMember); + if (!SD) + return; + SmallPtrSet<const ScheduleData *, 4> VisitedMemory; + for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) { + if (!VisitedMemory.insert(MemoryDep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): " @@ -5054,7 +5562,10 @@ private: DecrUnsched(MemoryDep); } // Handle the control dependencies. 
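// Illustrative sketch (not from the patch): the OperandsUses map above counts
// how many times each operand value appears in the member's operand list, so a
// repeated operand is released exactly once per counted use and never more.
// Standalone illustration with plain containers; names are illustrative only.
#include <cassert>
#include <map>
#include <string>
#include <vector>

inline void operandUseCountingExample() {
  // Something like "add %x, %x" mentions the same operand twice.
  std::vector<std::string> Operands = {"%x", "%x", "%y"};

  std::map<std::string, unsigned> OperandsUses;
  unsigned TotalOpCount = 0;
  for (const std::string &Op : Operands) {
    ++OperandsUses[Op];
    ++TotalOpCount;
  }
  assert(OperandsUses["%x"] == 2 && TotalOpCount == 3);

  // Releasing a dependency consumes one counted use at a time.
  auto Release = [&](const std::string &Op) {
    auto It = OperandsUses.find(Op);
    assert(It != OperandsUses.end() && It->second > 0 && "operand not counted");
    --It->second;
    --TotalOpCount;
  };
  Release("%x");
  Release("%x");
  Release("%y");
  assert(TotalOpCount == 0 && "every counted use was released exactly once");
}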
- for (ScheduleData *Dep : BundleMember->getControlDependencies()) { + SmallPtrSet<const ScheduleData *, 4> VisitedControl; + for (ScheduleData *Dep : SD->getControlDependencies()) { + if (!VisitedControl.insert(Dep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() @@ -5065,23 +5576,29 @@ private: if (auto *SD = dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, nullptr); + ProcessBundleMember(SD, {}); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n"); - auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { - ArrayRef<ScheduleBundle *> SDBundles = - getScheduleBundles(SD->getInst()); - return !SDBundles.empty() && - all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { - return SDBundle->isScheduled(); - }); - }; - for (ScheduleData *SD : Bundle.getBundle()) { - if (AreAllBundlesScheduled(SD)) { + auto AreAllBundlesScheduled = + [&](const ScheduleEntity *SD, + ArrayRef<ScheduleBundle *> SDBundles) { + if (isa<ScheduleCopyableData>(SD)) + return true; + return !SDBundles.empty() && + all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { + return SDBundle->isScheduled(); + }); + }; + for (ScheduleEntity *SD : Bundle.getBundle()) { + ArrayRef<ScheduleBundle *> SDBundles; + if (!isa<ScheduleCopyableData>(SD)) + SDBundles = getScheduleBundles(SD->getInst()); + if (AreAllBundlesScheduled(SD, SDBundles)) { SD->setScheduled(/*Scheduled=*/true); - ProcessBundleMember(SD, &Bundle); + ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle + : SDBundles); } } } @@ -5109,7 +5626,7 @@ private: auto *SD = getScheduleData(I); if (!SD) continue; - assert(isInSchedulingRegion(SD) && + assert(isInSchedulingRegion(*SD) && "primary schedule data not in window?"); SD->verify(); } @@ -5150,8 +5667,11 @@ private: /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. + /// \param VL The list of scalar instructions. + /// \param S The state of the instructions. + /// \param EI The edge in the SLP graph or the user node/operand number. ScheduleBundle &buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are @@ -5160,7 +5680,7 @@ private: /// std::nullopt if \p VL is allowed to be scheduled. std::optional<ScheduleBundle *> tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); @@ -5178,7 +5698,8 @@ private: /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList, - BoUpSLP *SLP); + BoUpSLP *SLP, + ArrayRef<ScheduleData *> ControlDeps = {}); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); @@ -5200,6 +5721,48 @@ private: /// ScheduleData structures are recycled. 
SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap; + /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand + /// number) and the operand instruction, represented as a copyable element. + SmallDenseMap<std::pair<EdgeInfo, const Value *>, + std::unique_ptr<ScheduleCopyableData>> + ScheduleCopyableDataMap; + + /// Represents the mapping between an instruction and all related + /// ScheduleCopyableData (for all uses in the tree, represented as copyable + /// elements). The SLP tree may contain several representations of the same + /// instruction. + SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInst; + + /// Represents the mapping between user value and operand number, the operand + /// value and all related ScheduleCopyableData. The relation is 1:n, because + /// the same user may reference the same operand in different tree entries + /// and the operand may be modelled by different copyable data elements. + SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>, + SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInstUser; + + /// Represents the mapping between an instruction and all related + /// ScheduleCopyableData. It maps the actual instruction to the last copyable + /// data element in the chain. E.g., if the graph models the following + /// instructions: + /// %0 = non-add instruction ... + /// ... + /// %4 = add %3, 1 + /// %5 = add %4, 1 + /// %6 = insertelement poison, %0, 0 + /// %7 = insertelement %6, %5, 1 + /// And the graph is modeled as: + /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ] + /// -> [1, 0] -> [%1, 0] + /// + /// this map will map %0 only to the copyable element <1>, which is the last + /// user (the direct user of the actual instruction). <0> uses <1>, so <1> will + /// keep the map to <0>, not %0. + SmallDenseMap<const Instruction *, + SmallSetVector<ScheduleCopyableData *, 4>> + ScheduleCopyableDataMapByUsers; + /// Attaches ScheduleBundle to Instruction. SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>> ScheduledBundles; @@ -5246,7 +5809,7 @@ private: /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. - void scheduleBlock(BlockScheduling *BS); + void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting.
const SmallDenseSet<Value *> *UserIgnoreList = nullptr; @@ -5319,6 +5882,30 @@ private: } // end namespace slpvectorizer +template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> { + using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>; + using SecondInfo = DenseMapInfo<unsigned>; + static BoUpSLP::EdgeInfo getEmptyKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()); + } + + static BoUpSLP::EdgeInfo getTombstoneKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) { + return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE), + SecondInfo::getHashValue(Val.EdgeIdx)); + } + + static bool isEqual(const BoUpSLP::EdgeInfo &LHS, + const BoUpSLP::EdgeInfo &RHS) { + return LHS == RHS; + } +}; + template <> struct GraphTraits<BoUpSLP *> { using TreeEntry = BoUpSLP::TreeEntry; @@ -7195,12 +7782,45 @@ bool BoUpSLP::isProfitableToReorder() const { // Check if the tree has only single store and single (unordered) load node, // other nodes are phis or geps/binops, combined with phis, and/or single // gather load node - bool HasPhis = false; if (VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::PHI && VectorizableTree.front()->Scalars.size() == TinyVF && VectorizableTree.front()->getNumOperands() > PhiOpsLimit) return false; + // Single node, which require reorder - skip. + if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store && + VectorizableTree.front()->ReorderIndices.empty()) { + const unsigned ReorderedSplitsCnt = + count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::SplitVectorize && + !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && + ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp()); + }); + if (ReorderedSplitsCnt <= 1 && + static_cast<unsigned>(count_if( + VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isGather() && + (TE->ReorderIndices.empty() || + (TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == + TreeEntry::Vectorize && + !TE->UserTreeIndex.UserTE->ReuseShuffleIndices + .empty()))) || + (TE->isGather() && TE->ReorderIndices.empty() && + (!TE->hasState() || TE->isAltShuffle() || + TE->getOpcode() == Instruction::Load || + TE->getOpcode() == Instruction::ZExt || + TE->getOpcode() == Instruction::SExt))) && + (VectorizableTree.front()->getVectorFactor() > TinyVF || + !TE->isGather() || none_of(TE->Scalars, [&](Value *V) { + return !isConstant(V) && isVectorized(V); + })); + })) >= VectorizableTree.size() - ReorderedSplitsCnt) + return false; + } + bool HasPhis = false; bool HasLoad = true; unsigned GatherLoads = 0; for (const std::unique_ptr<TreeEntry> &TE : @@ -9772,7 +10392,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, }))) { if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && - all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) { + (S.areInstructionsWithCopyableElements() || + all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) { // Find the number of elements, which forms full vectors. 
unsigned PWSz = getFullVectorNumberOfElements( TTI, UniqueValues.front()->getType(), UniqueValues.size()); @@ -9789,8 +10410,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, PaddedUniqueValues.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons operations are still valid for - // vectorization (div/rem are not allowed). + // Check that extended with poisons/copyable operations are still valid + // for vectorization (div/rem are not allowed). if (!S.areInstructionsWithCopyableElements() && !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); @@ -9952,35 +10573,41 @@ class InstructionsCompatibilityAnalysis { unsigned MainOpcode = 0; Instruction *MainOp = nullptr; + /// Checks if the opcode is supported as the main opcode for copyable + /// elements. + static bool isSupportedOpcode(const unsigned Opcode) { + return Opcode == Instruction::Add || Opcode == Instruction::LShr; + } + /// Identifies the best candidate value, which represents main opcode /// operation. /// Currently the best candidate is the Add instruction with the parent /// block with the highest DFS incoming number (block, that dominates other). - void findAndSetMainInstruction(ArrayRef<Value *> VL) { + void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. - auto IsSupportedOpcode = [](Instruction *I) { - return I && I->getOpcode() == Instruction::Add; + auto IsSupportedInstruction = [&](Instruction *I) { + return I && isSupportedOpcode(I->getOpcode()) && + (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; + // Exclude operands instructions immediately to improve compile time, it + // will be unable to schedule anyway. 
SmallDenseSet<Value *, 8> Operands; + SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (!DT.isReachableFromEntry(I->getParent())) continue; - if (!MainOp) { - MainOp = I; + if (Candidates.empty()) { + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.insert(I->op_begin(), I->op_end()); continue; } if (Parent == I->getParent()) { - if (!IsSupportedOpcode(MainOp)) - MainOp = I; - if (MainOp->getOpcode() == I->getOpcode() && - doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) - MainOp = I; + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Operands.insert(I->op_begin(), I->op_end()); continue; } @@ -9992,24 +10619,35 @@ class InstructionsCompatibilityAnalysis { (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"); if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { - MainOp = I; + Candidates.clear(); + Candidates.try_emplace(I->getOpcode()).first->second.push_back(I); Parent = I->getParent(); Operands.clear(); Operands.insert(I->op_begin(), I->op_end()); } } - if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { - MainOp = nullptr; - return; + unsigned BestOpcodeNum = 0; + MainOp = nullptr; + for (const auto &P : Candidates) { + if (P.second.size() < BestOpcodeNum) + continue; + for (Instruction *I : P.second) { + if (IsSupportedInstruction(I) && !Operands.contains(I)) { + MainOp = I; + BestOpcodeNum = P.second.size(); + break; + } + } } - MainOpcode = MainOp->getOpcode(); + if (MainOp) + MainOpcode = MainOp->getOpcode(); } /// Returns the idempotent value for the \p MainOp with the detected \p /// MainOpcode. For Add, returns 0. For Or, it should choose between false and /// the operand itself, since V or V == V. Value *selectBestIdempotentValue() const { - assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode"); return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), !MainOp->isCommutative()); } @@ -10022,13 +10660,8 @@ class InstructionsCompatibilityAnalysis { return {V, V}; if (!S.isCopyableElement(V)) return convertTo(cast<Instruction>(V), S).second; - switch (MainOpcode) { - case Instruction::Add: - return {V, selectBestIdempotentValue()}; - default: - break; - } - llvm_unreachable("Unsupported opcode"); + assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode"); + return {V, selectBestIdempotentValue()}; } /// Builds operands for the original instructions. @@ -10202,16 +10835,10 @@ public: return S; if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) return S; - findAndSetMainInstruction(VL); + findAndSetMainInstruction(VL, R); if (!MainOp) return InstructionsState::invalid(); S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); - // TODO: Remove this check once support for schulable copyables is landed. - if (any_of(VL, [&](Value *V) { - return S.isCopyableElement(V) && !S.isNonSchedulable(V); - })) - return InstructionsState::invalid(); - if (!WithProfitabilityCheck) return S; // Check if it is profitable to vectorize the instruction. 
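// Illustrative sketch (not from the patch): a copyable element V is modeled as
// "V <op> identity" so it can join a bundle whose main opcode it does not
// match. For Add the identity is 0 and for LShr a zero shift amount leaves the
// value unchanged; the real code obtains this constant from
// ConstantExpr::getBinOpIdentity. Standalone illustration; names are
// illustrative only.
#include <cassert>
#include <cstdint>

enum class MainOpcode { Add, LShr };

inline uint64_t applyWithIdentity(MainOpcode Opcode, uint64_t V) {
  switch (Opcode) {
  case MainOpcode::Add:
    return V + 0; // 0 is the identity for add
  case MainOpcode::LShr:
    return V >> 0; // shifting by zero is the identity for lshr
  }
  return V;
}

inline void idempotentValueExample() {
  assert(applyWithIdentity(MainOpcode::Add, 42) == 42);
  assert(applyWithIdentity(MainOpcode::LShr, 42) == 42);
}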
@@ -10247,6 +10874,21 @@ public: } if (!Res) return InstructionsState::invalid(); + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind); + InstructionCost VectorCost; + FixedVectorType *VecTy = + getWidenedType(S.getMainOp()->getType(), VL.size()); + switch (MainOpcode) { + case Instruction::Add: + case Instruction::LShr: + VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); + break; + default: + llvm_unreachable("Unexpected instruction."); + } + if (VectorCost > ScalarCost) + return InstructionsState::invalid(); return S; } assert(Operands.size() == 2 && "Unexpected number of operands!"); @@ -10731,7 +11373,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, SetVector<Value *> UniqueValues(llvm::from_range, VL); std::optional<ScheduleBundle *> BundlePtr = - BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S); + BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -11991,6 +12633,8 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } } +/// Check if we can convert fadd/fsub sequence to FMAD. +/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise. static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, @@ -12010,7 +12654,8 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, auto *I = dyn_cast<Instruction>(V); if (!I) continue; - // TODO: support for copyable elements. + if (S.isCopyableElement(I)) + continue; Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I); if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI) continue; @@ -12028,6 +12673,7 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, InstructionsState OpS = getSameOpcode(Operands.front(), TLI); if (!OpS.valid()) return InstructionCost::getInvalid(); + if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul) return InstructionCost::getInvalid(); if (!CheckForContractable(Operands.front())) @@ -12042,15 +12688,19 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, auto *I = dyn_cast<Instruction>(V); if (!I) continue; - if (auto *FPCI = dyn_cast<FPMathOperator>(I)) - FMF &= FPCI->getFastMathFlags(); + if (!S.isCopyableElement(I)) + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); } unsigned NumOps = 0; for (auto [V, Op] : zip(VL, Operands.front())) { + if (S.isCopyableElement(V)) + continue; auto *I = dyn_cast<Instruction>(Op); - if (!I || !I->hasOneUse()) { - FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind); + if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) { + if (auto *OpI = dyn_cast<Instruction>(V)) + FMACost += TTI.getInstructionCost(OpI, CostKind); if (I) FMACost += TTI.getInstructionCost(I, CostKind); continue; @@ -14687,6 +15337,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { }))))) return true; + // If the tree contains only buildvector, 2 non-buildvectors (with root user + // tree node) and other buildvectors, we can skip it. 
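// Illustrative sketch (not from the patch): canConvertToFMA above asks whether
// an fmul feeding an fadd/fsub chain may be contracted into fused multiply-adds
// and compares the TTI costs of both forms; copyable elements are now skipped
// when the fast-math flags are gathered. A standalone scalar view of the
// contraction itself (the real decision is the cost comparison, not shown):
#include <cmath>

inline double mulAddSeparate(double A, double B, double C) {
  return A * B + C; // two operations: fmul followed by fadd
}

inline double mulAddFused(double A, double B, double C) {
  return std::fma(A, B, C); // one fused operation with a single rounding
}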
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.front()->State == TreeEntry::SplitVectorize && + VectorizableTree.size() >= Limit && + count_if(ArrayRef(VectorizableTree).drop_front(), + [&](const std::unique_ptr<TreeEntry> &TE) { + return !TE->isGather() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->Idx == 0; + }) == 2) + return true; + + // If the tree contains only vectorization of the phi node from the + // buildvector - skip it. + if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.size() > 2 && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree[1]->State == TreeEntry::Vectorize && + VectorizableTree[1]->getOpcode() == Instruction::PHI && + all_of( + ArrayRef(VectorizableTree).drop_front(2), + [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -19234,7 +19909,7 @@ Value *BoUpSLP::vectorizeTree( EntryToLastInstruction.clear(); // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) - scheduleBlock(BSIter.second.get()); + scheduleBlock(*this, BSIter.second.get()); // Cache last instructions for the nodes to avoid side effects, which may // appear during vectorization, like extra uses, etc. for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { @@ -20041,24 +20716,29 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleExtractSeq.clear(); } -BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle( + ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>()); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (S.isNonSchedulable(V)) continue; - if (S.isCopyableElement(V)) + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(V)) { + // Add a copyable element model. + ScheduleCopyableData &SD = + addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr); + // Group the instructions to a bundle. + BundlePtr->add(&SD); continue; + } ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); // Group the instructions to a bundle. BundlePtr->add(BundleMember); - ScheduledBundles.try_emplace(cast<Instruction>(V)) - .first->getSecond() - .push_back(BundlePtr.get()); + ScheduledBundles.try_emplace(I).first->getSecond().push_back( + BundlePtr.get()); } assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle"); return *BundlePtr; @@ -20068,7 +20748,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, // and schedules instructions until the bundle gets ready. std::optional<BoUpSLP::ScheduleBundle *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S) { + const InstructionsState &S, + const EdgeInfo &EI) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. 
bool HasCopyables = S.areInstructionsWithCopyableElements(); @@ -20078,33 +20759,83 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) return nullptr; - // TODO Remove once full support for copyables is landed. - assert(all_of(VL, - [&](Value *V) { - return !S.isCopyableElement(V) || S.isNonSchedulable(V); - }) && - "Copyable elements should not be schedulable"); // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) { + // Clear deps or recalculate the region, if the memory instruction is a + // copyable. It may have memory deps, which must be recalculated. + SmallVector<ScheduleData *> ControlDependentMembers; + auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) { + SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps; + for (ScheduleEntity *SE : Bundle.getBundle()) { + if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) { + if (ScheduleData *BundleMember = getScheduleData(SD->getInst()); + BundleMember && BundleMember->hasValidDependencies()) { + BundleMember->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor( + BundleMember->getInst())) + ControlDependentMembers.push_back(BundleMember); + } + continue; + } + auto *SD = cast<ScheduleData>(SE); + for (const Use &U : SD->getInst()->operands()) { + unsigned &NumOps = + UserOpToNumOps + .try_emplace(std::make_pair(SD->getInst(), U.get()), 0) + .first->getSecond(); + ++NumOps; + if (auto *Op = dyn_cast<Instruction>(U.get()); + Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, + *SLP, NumOps)) { + if (ScheduleData *OpSD = getScheduleData(Op)) { + OpSD->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) + ControlDependentMembers.push_back(OpSD); + } + } + } + } + }; // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) { - for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) SD->clearDependencies(); - } + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) + SD->clearDependencies(); + }); + }); ReSchedule = true; } + // Check if the bundle data has deps for copyable elements already. In + // this case need to reset deps and recalculate it. 
if (Bundle && !Bundle.getBundle().empty()) { + if (S.areInstructionsWithCopyableElements() || + !ScheduleCopyableDataMap.empty()) + CheckIfNeedToClearDeps(Bundle); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block " << BB->getName() << "\n"); - calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP); + calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP, + ControlDependentMembers); + } else if (!ControlDependentMembers.empty()) { + ScheduleBundle Invalid = ScheduleBundle::invalid(); + calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP, + ControlDependentMembers); } if (ReSchedule) { @@ -20120,7 +20851,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, !ReadyInsts.empty()) { ScheduleEntity *Picked = ReadyInsts.pop_back_val(); assert(Picked->isReady() && "must be ready to schedule"); - schedule(Picked, ReadyInsts); + schedule(*SLP, S, EI, Picked, ReadyInsts); if (Picked == &Bundle) break; } @@ -20129,7 +20860,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -20146,11 +20877,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; + SmallVector<ScheduleCopyableData *> CopyableData = + getScheduleCopyableData(cast<Instruction>(V)); + if (!CopyableData.empty()) { + for (ScheduleCopyableData *SD : CopyableData) + ReadyInsts.remove(SD); + } ScheduleData *BundleMember = getScheduleData(V); - assert(BundleMember && + assert((BundleMember || S.isCopyableElement(V)) && "no ScheduleData for bundle member (maybe not in same basic block)"); + if (!BundleMember) + continue; // Make sure we don't leave the pieces of the bundle in the ready list when // whole bundle might not be ready. @@ -20161,20 +20900,25 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ReadyInsts.remove(B); } - if (!BundleMember->isScheduled()) + if (!S.isCopyableElement(V) && !BundleMember->isScheduled()) continue; // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. + // A bundle member has deps calculated before it was copyable element - need + // to reschedule. LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"); ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL, S); + ScheduleBundle &Bundle = buildBundle(VL, S, EI); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { - for (ScheduleData *BD : Bundle.getBundle()) { + for (ScheduleEntity *BD : Bundle.getBundle()) { + // Copyable data scheduling is just removed. 
+ if (isa<ScheduleCopyableData>(BD)) + continue; if (BD->isReady()) { ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst()); if (Bundles.empty()) { @@ -20187,10 +20931,66 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } } ScheduledBundlesList.pop_back(); + SmallVector<ScheduleData *> ControlDependentMembers; + SmallPtrSet<Instruction *, 4> Visited; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; - ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back(); + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(I)) { + // Remove the copyable data from the scheduling region and restore + // previous mappings. + auto KV = std::make_pair(EI, I); + assert(ScheduleCopyableDataMap.contains(KV) && + "no ScheduleCopyableData for copyable element"); + ScheduleCopyableData *SD = + ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val(); + ScheduleCopyableDataMapByUsers[I].remove(SD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (!Visited.insert(In).second) { + It = find(make_range(std::next(It), Op.end()), I); + break; + } + ScheduleCopyableDataMapByInstUser + [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)] + .pop_back(); + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I)) + ScheduleCopyableDataMapByUsers[I].insert(UserCD); + } + if (ScheduleCopyableDataMapByUsers[I].empty()) + ScheduleCopyableDataMapByUsers.erase(I); + ScheduleCopyableDataMap.erase(KV); + // Need to recalculate dependencies for the actual schedule data. + if (ScheduleData *OpSD = getScheduleData(I)) { + OpSD->clearDirectDependencies(); + if (RegionHasStackSave || + !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) + ControlDependentMembers.push_back(OpSD); + } + continue; + } + ScheduledBundles.find(I)->getSecond().pop_back(); + } + if (!ControlDependentMembers.empty()) { + ScheduleBundle Invalid = ScheduleBundle::invalid(); + calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP, + ControlDependentMembers); } return std::nullopt; } @@ -20210,10 +21010,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion( Value *V, const InstructionsState &S) { Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && - !doesNotNeedToBeScheduled(I) && - "phi nodes/insertelements/extractelements/extractvalues don't need to " - "be scheduled"); if (getScheduleData(I)) return true; if (!ScheduleStart) { @@ -20283,14 +21079,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { // No need to allocate data for non-schedulable instructions. 
- if (doesNotNeedToBeScheduled(I)) + if (isa<PHINode>(I)) continue; ScheduleData *SD = ScheduleDataMap.lookup(I); if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; } - assert(!isInSchedulingRegion(SD) && + assert(!isInSchedulingRegion(*SD) && "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID, I); @@ -20320,34 +21116,128 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } } -void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, - bool InsertInReadyList, - BoUpSLP *SLP) { - SmallVector<ScheduleData *> WorkList; - auto ProcessNode = [&](ScheduleData *BundleMember) { +void BoUpSLP::BlockScheduling::calculateDependencies( + ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP, + ArrayRef<ScheduleData *> ControlDeps) { + SmallVector<ScheduleEntity *> WorkList; + auto ProcessNode = [&](ScheduleEntity *SE) { + if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) { + if (CD->hasValidDependencies()) + return; + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n"); + CD->initDependencies(); + CD->resetUnscheduledDeps(); + const EdgeInfo &EI = CD->getEdgeInfo(); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, CD->getInst()); + assert(It != Op.end() && "Lane not set"); + SmallPtrSet<Instruction *, 4> Visited; + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (EI.UserTE->isCopyableElement(In)) { + // We may not have related copyable scheduling data, if the + // instruction is non-schedulable. + if (ScheduleCopyableData *UseSD = + getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + } else if (Visited.insert(In).second) { + if (ScheduleData *UseSD = getScheduleData(In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + } + It = find(make_range(std::next(It), Op.end()), CD->getInst()); + } while (It != Op.end()); + if (CD->isReady() && CD->getDependencies() == 0 && + (EI.UserTE->hasState() && + (EI.UserTE->getMainOp()->getParent() != + CD->getInst()->getParent() || + (isa<PHINode>(EI.UserTE->getMainOp()) && + (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) || + any_of(EI.UserTE->getMainOp()->users(), [&](User *U) { + auto *IU = dyn_cast<Instruction>(U); + if (!IU) + return true; + return IU->getParent() == EI.UserTE->getMainOp()->getParent(); + })))))) { + // If there are no uses in the block, mark as having a pseudo-use, which cannot + // be scheduled. + // Prevents incorrect def-use tracking between external user and + // actual instruction.
+ CD->incDependencies(); + CD->incrementUnscheduledDeps(1); + } + } + return; + } + auto *BundleMember = cast<ScheduleData>(SE); if (BundleMember->hasValidDependencies()) return; LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); BundleMember->initDependencies(); BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. + SmallDenseMap<Value *, unsigned> UserToNumOps; for (User *U : BundleMember->getInst()->users()) { + if (isa<PHINode>(U)) + continue; if (ScheduleData *UseSD = getScheduleData(U)) { + // The operand is a copyable element - skip. + unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond(); + ++NumOps; + if (areAllOperandsReplacedByCopyableData( + cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps)) + continue; BundleMember->incDependencies(); if (!UseSD->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(UseSD); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); } } + for (ScheduleCopyableData *UseSD : + getScheduleCopyableDataUsers(BundleMember->getInst())) { + BundleMember->incDependencies(); + if (!UseSD->isScheduled()) + BundleMember->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + SmallPtrSet<const Instruction *, 4> Visited; auto MakeControlDependent = [&](Instruction *I) { + // Do not mark control dependent twice. + if (!Visited.insert(I).second) + return; auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->addControlDependency(BundleMember); BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); }; // Any instruction which isn't safe to speculate at the beginning of the @@ -20426,7 +21316,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, for (ScheduleData *DepDest = NextLoadStore; DepDest; DepDest = DepDest->getNextLoadStore()) { - assert(isInSchedulingRegion(DepDest) && "Expected to be in region"); + assert(isInSchedulingRegion(*DepDest) && "Expected to be in region"); // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to @@ -20449,7 +21339,9 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); } // Example, explaining the loop break condition: Let's assume our @@ -20471,13 +21363,25 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, } }; - WorkList.push_back(Bundle.getBundle().front()); + assert((Bundle || !ControlDeps.empty()) && + "expected at least one instruction to schedule"); + if (Bundle) + WorkList.push_back(Bundle.getBundle().front()); + WorkList.append(ControlDeps.begin(), ControlDeps.end()); SmallPtrSet<ScheduleBundle *, 16> Visited; while (!WorkList.empty()) { - ScheduleData *SD = WorkList.pop_back_val(); - ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst()); + ScheduleEntity *SD = WorkList.pop_back_val(); + SmallVector<ScheduleBundle *, 1> CopyableBundle; + 
ArrayRef<ScheduleBundle *> Bundles; + if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(SD->getInst()); + } if (Bundles.empty()) { - ProcessNode(SD); + if (!SD->hasValidDependencies()) + ProcessNode(SD); if (InsertInReadyList && SD->isReady()) { ReadyInsts.insert(SD); LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n"); @@ -20485,7 +21389,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, continue; } for (ScheduleBundle *Bundle : Bundles) { - if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies()) + if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second) continue; assert(isInSchedulingRegion(*Bundle) && "ScheduleData not in scheduling region"); @@ -20508,23 +21412,40 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, void BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); - for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) { SD->setScheduled(/*Scheduled=*/false); SD->resetUnscheduledDeps(); } - for (ScheduleBundle *Bundle : getScheduleBundles(I)) { - assert(isInSchedulingRegion(*Bundle) && - "ScheduleBundle not in scheduling region"); - Bundle->setScheduled(/*Scheduled=*/false); + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) { + SD->setScheduled(/*Scheduled=*/false); + SD->resetUnscheduledDeps(); + } + }); + }); + for_each(ScheduledBundles, [&](auto &P) { + for_each(P.second, [&](ScheduleBundle *Bundle) { + if (isInSchedulingRegion(*Bundle)) + Bundle->setScheduled(/*Scheduled=*/false); + }); + }); + // Reset schedule data for copyable elements. 
+ for (auto &P : ScheduleCopyableDataMap) { + if (isInSchedulingRegion(*P.second)) { + P.second->setScheduled(/*Scheduled=*/false); + P.second->resetUnscheduledDeps(); } } ReadyInsts.clear(); } -void BoUpSLP::scheduleBlock(BlockScheduling *BS) { +void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -20562,15 +21483,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!Bundle->hasValidDependencies()) BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this); } + SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I); + for (ScheduleCopyableData *SD : reverse(SDs)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } continue; } + SmallVector<ScheduleCopyableData *> CopyableData = + BS->getScheduleCopyableDataUsers(I); if (ScheduleData *SD = BS->getScheduleData(I)) { [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I); assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() || - SDTEs.front()->doesNotNeedToSchedule()) && + SDTEs.front()->doesNotNeedToSchedule() || + doesNotNeedToBeScheduled(I)) && "scheduler and vectorizer bundle mismatch"); SD->setSchedulingPriority(Idx++); - continue; + if (!SD->hasValidDependencies() && + (!CopyableData.empty() || + any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) { + assert(TE->isGather() && "expected gather node"); + return TE->hasState() && TE->hasCopyableElements() && + TE->isCopyableElement(I); + }))) { + // Need to calculate deps for these nodes to correctly handle copyable + // dependencies, even if they were cancelled. + // If the copyables bundle was cancelled, the deps are cleared and need to + // be recalculated. + ScheduleBundle Bundle; + Bundle.add(SD); + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } + } + for (ScheduleCopyableData *SD : reverse(CopyableData)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); } } BS->initialFillReadyList(ReadyInsts); @@ -20586,9 +21537,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) { - for (const ScheduleData *BundleMember : Bundle->getBundle()) { + for (const ScheduleEntity *BundleMember : Bundle->getBundle()) { Instruction *PickedInst = BundleMember->getInst(); - if (!Scheduled.insert(PickedInst).second) + // If a copyable must be scheduled as part of something else, skip it. + bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst); + if ((IsCopyable && BS->getScheduleData(PickedInst)) || + (!IsCopyable && !Scheduled.insert(PickedInst).second)) continue; if (PickedInst->getNextNode() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); @@ -20603,7 +21557,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } - BS->schedule(Picked, ReadyInsts); + auto Invalid = InstructionsState::invalid(); + BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts); } // Check that we didn't break any of our invariants.
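For context, an illustrative helper (not part of the patch) built on the same ValueTracking query the scheduling changes above rely on: instructions for which isGuaranteedToTransferExecutionToSuccessor() is false are exactly the ones collected into ControlDependentMembers so that their control dependencies can be recomputed after clearDirectDependencies().

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Collect the instructions in BB after which execution is not guaranteed to
// reach the next instruction (possible throw, noreturn call, etc.). These are
// the instructions that force extra control dependencies in the scheduler.
static SmallVector<const Instruction *, 8>
collectControlDependentCandidates(const BasicBlock &BB) {
  SmallVector<const Instruction *, 8> Result;
  for (const Instruction &I : BB)
    if (!isGuaranteedToTransferExecutionToSuccessor(&I))
      Result.push_back(&I);
  return Result;
}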
@@ -20965,9 +21920,11 @@ bool BoUpSLP::collectValuesToDemote( return all_of(E.Scalars, [&](Value *V) { if (isa<PoisonValue>(V)) return true; + APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.isCopyableElement(V)) + return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL)); auto *I = cast<Instruction>(V); KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); - APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); return AmtKnownBits.getMaxValue().ult(BitWidth) && MaskedValueIsZero(I->getOperand(0), ShiftedBits, SimplifyQuery(*DL)); @@ -22729,21 +23686,11 @@ public: /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - DominatorTree &DT, TargetTransformInfo &TTI) { + const TargetLibraryInfo &TLI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; - // FMA reduction root - skip. - auto CheckForFMA = [&](Instruction *I) { - return RdxKind == RecurKind::FAdd && - canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) - .isValid(); - }; - if (CheckForFMA(Root)) - return false; - // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22781,7 +23728,7 @@ public: // Also, do not try to reduce const values, if the operation is not // foldable. if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || + getRdxKind(EdgeInst) != RdxKind || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -23530,7 +24477,7 @@ public: // correct, replace internal uses with undef, and mark for eventual // deletion. 
#ifndef NDEBUG - SmallSet<Value *, 4> IgnoreSet; + SmallPtrSet<Value *, 4> IgnoreSet; for (ArrayRef<Value *> RdxOps : ReductionOps) IgnoreSet.insert_range(RdxOps); #endif @@ -23843,6 +24790,8 @@ private: case RecurKind::FMinimum: // res = vv break; + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -23982,6 +24931,8 @@ private: case RecurKind::FMinimum: // res = vv return VectorizedValue; + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -24086,6 +25037,8 @@ private: auto *Scale = ConstantVector::get(Vals); return Builder.CreateFMul(VectorizedValue, Scale); } + case RecurKind::Sub: + case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: @@ -24356,7 +25309,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index f32d57f..e414c12 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes( case Instruction::Opcode::FPToUI: case Instruction::Opcode::FPToSI: case Instruction::Opcode::FPExt: + case Instruction::Opcode::PtrToAddr: case Instruction::Opcode::PtrToInt: case Instruction::Opcode::IntToPtr: case Instruction::Opcode::SIToFP: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 73babcc..f972efa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -246,8 +246,7 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI, IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop, Type *CanonicalIVTy) : TTI(TTI), VF(VF), CFG(DT), LI(LI), AC(AC), Builder(Builder), Plan(Plan), - CurrentParentLoop(CurrentParentLoop), TypeAnalysis(CanonicalIVTy), - VPDT(*Plan) {} + CurrentParentLoop(CurrentParentLoop), TypeAnalysis(*Plan), VPDT(*Plan) {} Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -296,27 +295,11 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { if (hasVectorValue(Def)) return Data.VPV2Vector[Def]; - auto GetBroadcastInstrs = [this, Def](Value *V) { - bool SafeToHoist = - !Def->hasDefiningRecipe() || - VPDT.properlyDominates(Def->getDefiningRecipe()->getParent(), - Plan->getVectorPreheader()); - + auto GetBroadcastInstrs = [this](Value *V) { if (VF.isScalar()) return V; - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = - CFG.VPBB2IRBB[Plan->getVectorPreheader()]; - if (LoopVectorPreHeader) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - } - - // Place the code for broadcasting invariant variables in the new preheader. // Broadcast the scalar into all locations in the vector. 
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - return Shuf; }; @@ -372,6 +355,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { set(Def, VectorValue); } else { assert(!VF.isScalable() && "VF is assumed to be non scalable."); + assert(isa<VPInstruction>(Def) && + "Explicit BuildVector recipes must have" + "handled packing for non-VPInstructions."); // Initialize packing with insertelements to start from poison. VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane) @@ -951,28 +937,6 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) { - if (!VectorTripCount.getUnderlyingValue()) - VectorTripCount.setUnderlyingValue(VectorTripCountV); - else - assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && - "VectorTripCount set earlier must much VectorTripCountV"); - - IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - Type *TCTy = VectorTripCountV->getType(); - // FIXME: Model VF * UF computation completely in VPlan. - unsigned UF = getUF(); - if (VF.getNumUsers()) { - Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); - VF.setUnderlyingValue(RuntimeVF); - VFxUF.setUnderlyingValue( - UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF)) - : RuntimeVF); - } else { - VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF)); - } -} - VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const { auto Iter = find_if(getExitBlocks(), [IRBB](const VPIRBasicBlock *VPIRBB) { return VPIRBB->getIRBasicBlock() == IRBB; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c42cdd5..46e55be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1019,7 +1019,11 @@ public: /// The lane specifies an index into a vector formed by combining all vector /// operands (all operands after the first one). ExtractLane, - + /// Explicit user for the resume phi of the canonical induction in the main + /// VPlan, used by the epilogue vector loop. + ResumeForEpilogue, + /// Returns the value for vscale. + VScale, }; private: @@ -1167,6 +1171,7 @@ public: switch (VPI->getOpcode()) { case VPInstruction::WideIVStep: case VPInstruction::StepVector: + case VPInstruction::VScale: return true; default: return false; @@ -1227,6 +1232,31 @@ public: return getAsRecipe()->getNumOperands(); } + /// Returns an interator range over the incoming values. + VPUser::const_operand_range incoming_values() const { + return make_range(getAsRecipe()->op_begin(), + getAsRecipe()->op_begin() + getNumIncoming()); + } + + using const_incoming_blocks_range = iterator_range<mapped_iterator< + detail::index_iterator, std::function<const VPBasicBlock *(size_t)>>>; + + /// Returns an iterator range over the incoming blocks. + const_incoming_blocks_range incoming_blocks() const { + std::function<const VPBasicBlock *(size_t)> GetBlock = [this](size_t Idx) { + return getIncomingBlock(Idx); + }; + return map_range(index_range(0, getNumIncoming()), GetBlock); + } + + /// Returns an iterator range over pairs of incoming values and corresponding + /// incoming blocks. 
+ detail::zippy<llvm::detail::zip_first, VPUser::const_operand_range, + const_incoming_blocks_range> + incoming_values_and_blocks() const { + return zip_equal(incoming_values(), incoming_blocks()); + } + /// Removes the incoming value for \p IncomingBlock, which must be a /// predecessor. void removeIncomingValueFor(VPBlockBase *IncomingBlock) const; @@ -2298,6 +2328,11 @@ public: VPSlotTracker &SlotTracker) const override; #endif + /// Returns the number of incoming values, also number of incoming blocks. + /// Note that at the moment, VPWidenPointerInductionRecipe only has a single + /// incoming value, its start value. + unsigned getNumIncoming() const override { return 2; } + /// Returns the recurrence kind of the reduction. RecurKind getRecurrenceKind() const { return Kind; } @@ -2408,11 +2443,11 @@ public: // TODO: extend the masked interleaved-group support to reversed access. assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) { - if (I->getType()->isVoidTy()) + for (unsigned I = 0; I < IG->getFactor(); ++I) + if (Instruction *Inst = IG->getMember(I)) { + if (Inst->getType()->isVoidTy()) continue; - new VPValue(I, this); + new VPValue(Inst, this); } for (auto *SV : StoredValues) @@ -3076,10 +3111,11 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, /// using the address to load from, the explicit vector length and an optional /// mask. struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { - VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask) + VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL, + VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), - {L.getAddr(), &EVL}, L.isConsecutive(), - L.isReverse(), L, L.getDebugLoc()), + {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L, + L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3157,11 +3193,11 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { /// using the value to store, the address to store to, the explicit vector /// length and an optional mask. struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { - VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask) + VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL, + VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), - {S.getAddr(), S.getStoredValue(), &EVL}, - S.isConsecutive(), S.isReverse(), S, - S.getDebugLoc()) { + {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(), + S.isReverse(), S, S.getDebugLoc()) { setMask(Mask); } @@ -3968,9 +4004,6 @@ public: VPBB->setPlan(this); } - /// Prepare the plan for execution, setting up the required live-in values. - void prepareToExecute(Value *VectorTripCount, VPTransformState &State); - /// Generate the IR code for this VPlan. 
void execute(VPTransformState *State); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 4c3cdda..b39231f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -21,8 +21,7 @@ using namespace llvm; #define DEBUG_TYPE "vplan" -VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) - : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) { +VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) { if (auto LoopRegion = Plan.getVectorLoopRegion()) { if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>( &LoopRegion->getEntryBasicBlock()->front())) { @@ -74,6 +73,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case Instruction::ExtractElement: case Instruction::Freeze: case VPInstruction::ReductionStartVector: + case VPInstruction::ResumeForEpilogue: return inferScalarType(R->getOperand(0)); case Instruction::Select: { Type *ResTy = inferScalarType(R->getOperand(1)); @@ -500,7 +500,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan( LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); const auto &TTICapture = TTI; auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index cd86d27..c6c4369 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -58,9 +58,6 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); public: - VPTypeAnalysis(Type *CanonicalIVTy) - : CanonicalIVTy(CanonicalIVTy), Ctx(CanonicalIVTy->getContext()) {} - VPTypeAnalysis(const VPlan &Plan); /// Infer the type of \p V. Returns the scalar type of \p V. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 7e8eff31..b231a84 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -336,12 +336,6 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() { return std::move(Plan); } -std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop, - LoopInfo &LI) { - PlainCFGBuilder Builder(TheLoop, &LI); - return Builder.buildPlainCFG(); -} - /// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it /// has exactly 2 predecessors (preheader and latch), where the block /// dominates the latch and the preheader dominates the block. If it is a @@ -457,10 +451,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, LatchDL); } -void VPlanTransforms::prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableEarlyExit, VFRange &Range) { +static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE, Loop *TheLoop) { VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -486,12 +478,54 @@ void VPlanTransforms::prepareForVectorization( addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL); - [[maybe_unused]] bool HandledUncountableEarlyExit = false; + // Create SCEV and VPValue for the trip count. 
+ // We use the symbolic max backedge-taken-count, which works also when + // vectorizing loops with uncountable early exits. + const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && + "Invalid backedge-taken count"); + ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, + InductionTy, TheLoop); + Plan.setTripCount( + vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); + + VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); + + // The connection order corresponds to the operands of the conditional branch, + // with the middle block already connected to the exit block. + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + // Also connect the entry block to the scalar preheader. + // TODO: Also introduce a branch recipe together with the minimum trip count + // check. + VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); + Plan.getEntry()->swapSuccessors(); +} + +std::unique_ptr<VPlan> +VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, + DebugLoc IVDL, PredicatedScalarEvolution &PSE) { + PlainCFGBuilder Builder(TheLoop, &LI); + std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG(); + addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); + return VPlan0; +} + +void VPlanTransforms::handleEarlyExits(VPlan &Plan, + bool HasUncountableEarlyExit, + VFRange &Range) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); + VPBlockBase *HeaderVPB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[1]); + // Disconnect all early exits from the loop leaving it with a single exit from // the latch. Early exits that are countable are left for a scalar epilog. The // condition of uncountable early exits (currently at most one is supported) // is fused into the latch exit, and used to branch from middle block to the // early exit destination. + [[maybe_unused]] bool HandledUncountableEarlyExit = false; for (VPIRBasicBlock *EB : Plan.getExitBlocks()) { for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) { if (Pred == MiddleVPBB) @@ -500,7 +534,8 @@ void VPlanTransforms::prepareForVectorization( assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - HeaderVPBB, LatchVPBB, Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB, + Range); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -513,36 +548,18 @@ void VPlanTransforms::prepareForVectorization( assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) && "missed an uncountable exit that must be handled"); +} - // Create SCEV and VPValue for the trip count. - // We use the symbolic max backedge-taken-count, which works also when - // vectorizing loops with uncountable early exits. 
- const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && - "Invalid loop count"); - ScalarEvolution &SE = *PSE.getSE(); - const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, - InductionTy, TheLoop); - Plan.setTripCount( - vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); - - VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); - VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); - - // The connection order corresponds to the operands of the conditional branch, - // with the middle block already connected to the exit block. - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); - // Also connect the entry block to the scalar preheader. - // TODO: Also introduce a branch recipe together with the minimum trip count - // check. - VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); - Plan.getEntry()->swapSuccessors(); - +void VPlanTransforms::addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); // If MiddleVPBB has a single successor then the original loop does not exit // via the latch and the single successor must be the scalar preheader. // There's no need to add a runtime check to MiddleVPBB. if (MiddleVPBB->getNumSuccessors() == 1) { - assert(MiddleVPBB->getSingleSuccessor() == ScalarPH && + assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() && "must have ScalarPH as single successor"); return; } @@ -564,6 +581,7 @@ void VPlanTransforms::prepareForVectorization( // the corresponding compare because they may have ended up with different // line numbers and we want to avoid awkward line stepping while debugging. // E.g., if the compare has got a line number inside the loop. + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); VPBuilder Builder(MiddleVPBB); VPValue *Cmp; diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 4154720c..5ad2ac6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -351,10 +351,10 @@ struct VPCostContext { TargetTransformInfo::TargetCostKind CostKind; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, - Type *CanIVTy, LoopVectorizationCostModel &CM, + const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind) - : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), - CM(CM), CostKind(CostKind) {} + : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), + CostKind(CostKind) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. 
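For context, a sketch (not part of the patch, and using ScalarEvolution directly rather than the PredicatedScalarEvolution wrapper the code goes through) of the trip-count derivation performed by addInitialSkeleton() in the VPlanConstruction.cpp hunk above: the symbolic maximum backedge-taken count is defined even for loops with uncountable early exits, and the trip count is that value plus one, evaluated in the induction type.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Returns the trip-count SCEV for L, or nullptr if no usable exit count is
// known. Mirrors the backedge-taken-count to trip-count step shown above.
static const SCEV *computeTripCountSCEV(const Loop *L, ScalarEvolution &SE,
                                        Type *InductionTy) {
  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return nullptr;
  // Trip count = backedge-taken count + 1, extended or truncated to
  // InductionTy as needed.
  return SE.getTripCountFromExitCount(BTC, InductionTy, L);
}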
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 8818843..9f036fb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -200,15 +200,11 @@ template <typename Ops_t, unsigned Opcode, bool Commutative, struct Recipe_match { Ops_t Ops; - Recipe_match() : Ops() { - static_assert(std::tuple_size<Ops_t>::value == 0 && - "constructor can only be used with zero operands"); - } - Recipe_match(Ops_t Ops) : Ops(Ops) {} - template <typename A_t, typename B_t> - Recipe_match(A_t A, B_t B) : Ops({A, B}) { - static_assert(std::tuple_size<Ops_t>::value == 2 && - "constructor can only be used for binary matcher"); + template <typename... OpTy> Recipe_match(OpTy... Ops) : Ops(Ops...) { + static_assert(std::tuple_size<Ops_t>::value == sizeof...(Ops) && + "number of operands in constructor doesn't match Ops_t"); + static_assert((!Commutative || std::tuple_size<Ops_t>::value == 2) && + "only binary ops can be commutative"); } bool match(const VPValue *V) const { @@ -254,7 +250,6 @@ private: // Check for recipes that do not have opcodes. if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value || std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value || - std::is_same<RecipeTy, VPWidenSelectRecipe>::value || std::is_same<RecipeTy, VPDerivedIVRecipe>::value || std::is_same<RecipeTy, VPWidenGEPRecipe>::value) return DefR; @@ -270,195 +265,128 @@ private: } }; -template <unsigned Opcode, typename... RecipeTys> -using ZeroOpRecipe_match = - Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>; - -template <typename Op0_t, unsigned Opcode, typename... RecipeTys> -using UnaryRecipe_match = - Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>; - -template <typename Op0_t, unsigned Opcode> -using UnaryVPInstruction_match = - UnaryRecipe_match<Op0_t, Opcode, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using AllRecipe_match = + Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ false, + VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe, + VPInstruction, VPWidenSelectRecipe>; -template <unsigned Opcode> -using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using AllRecipe_commutative_match = + Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ true, + VPWidenRecipe, VPReplicateRecipe, VPInstruction>; -template <typename Op0_t, unsigned Opcode> -using AllUnaryRecipe_match = - UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe, - VPWidenCastRecipe, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +using VPInstruction_match = Recipe_match<std::tuple<OpTys...>, Opcode, + /*Commutative*/ false, VPInstruction>; -template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative, - typename... RecipeTys> -using BinaryRecipe_match = - Recipe_match<std::tuple<Op0_t, Op1_t>, Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, unsigned Opcode> -using BinaryVPInstruction_match = - BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode, - bool Commutative, typename... 
RecipeTys> -using TernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, - Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode> -using TernaryVPInstruction_match = - TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <typename Op0_t, typename Op1_t, unsigned Opcode, - bool Commutative = false> -using AllBinaryRecipe_match = - BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe, - VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>; +template <unsigned Opcode, typename... OpTys> +inline VPInstruction_match<Opcode, OpTys...> +m_VPInstruction(const OpTys &...Ops) { + return VPInstruction_match<Opcode, OpTys...>(Ops...); +} /// BuildVector is matches only its opcode, w/o matching its operands as the /// number of operands is not fixed. -inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { - return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); -} - -template <unsigned Opcode, typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, Opcode> -m_VPInstruction(const Op0_t &Op0) { - return UnaryVPInstruction_match<Op0_t, Opcode>(Op0); -} - -template <unsigned Opcode, typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) { - return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1); +inline VPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { + return m_VPInstruction<VPInstruction::BuildVector>(); } -template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t> -inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { - return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>( - {Op0, Op1, Op2}); -} - -template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t, - unsigned Opcode, bool Commutative, typename... 
RecipeTys> -using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>, - Opcode, Commutative, RecipeTys...>; - -template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t, - unsigned Opcode> -using VPInstruction4Op_match = - Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false, - VPInstruction>; - -template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t, - typename Op3_t> -inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode> -m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2, - const Op3_t &Op3) { - return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>( - {Op0, Op1, Op2, Op3}); -} template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze> +inline VPInstruction_match<Instruction::Freeze, Op0_t> m_Freeze(const Op0_t &Op0) { return m_VPInstruction<Instruction::Freeze>(Op0); } template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, VPInstruction::BranchOnCond> +inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t> m_BranchOnCond(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::BranchOnCond>(Op0); } template <typename Op0_t> -inline UnaryVPInstruction_match<Op0_t, VPInstruction::Broadcast> +inline VPInstruction_match<VPInstruction::Broadcast, Op0_t> m_Broadcast(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::Broadcast>(Op0); } template <typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask> +inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t> m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount> +inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t> m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1); } template <unsigned Opcode, typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) { - return AllUnaryRecipe_match<Op0_t, Opcode>(Op0); +inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) { + return AllRecipe_match<Opcode, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc> -m_Trunc(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) { return m_Unary<Instruction::Trunc, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) { return m_Unary<Instruction::ZExt, Op0_t>(Op0); } template <typename Op0_t> -inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) { +inline AllRecipe_match<Instruction::SExt, Op0_t> m_SExt(const Op0_t &Op0) { return m_Unary<Instruction::SExt, Op0_t>(Op0); } template <typename Op0_t> -inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>, - AllUnaryRecipe_match<Op0_t, Instruction::SExt>> +inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, + AllRecipe_match<Instruction::SExt, Op0_t>> m_ZExtOrSExt(const Op0_t &Op0) { return m_CombineOr(m_ZExt(Op0), m_SExt(Op0)); } -template <unsigned Opcode, typename Op0_t, typename Op1_t, - bool Commutative = false> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative> -m_Binary(const 
Op0_t &Op0, const Op1_t &Op1) { - return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1); +template <unsigned Opcode, typename Op0_t, typename Op1_t> +inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0, + const Op1_t &Op1) { + return AllRecipe_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } template <unsigned Opcode, typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true> +inline AllRecipe_commutative_match<Opcode, Op0_t, Op1_t> m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { - return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1); + return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul> -m_Mul(const Op0_t &Op0, const Op1_t &Op1) { +inline AllRecipe_match<Instruction::Mul, Op0_t, Op1_t> m_Mul(const Op0_t &Op0, + const Op1_t &Op1) { return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul, - /* Commutative =*/true> +inline AllRecipe_commutative_match<Instruction::Mul, Op0_t, Op1_t> m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { - return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1); + return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative /// version of the matcher. -template <typename Op0_t, typename Op1_t, bool Commutative = false> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, Commutative> +template <typename Op0_t, typename Op1_t> +inline AllRecipe_match<Instruction::Or, Op0_t, Op1_t> m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { - return m_Binary<Instruction::Or, Op0_t, Op1_t, Commutative>(Op0, Op1); + return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1); } template <typename Op0_t, typename Op1_t> -inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, - /*Commutative*/ true> +inline AllRecipe_commutative_match<Instruction::Or, Op0_t, Op1_t> m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { - return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1); + return m_c_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1); } /// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison @@ -523,9 +451,9 @@ m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { template <typename Op0_t, typename Op1_t> using GEPLikeRecipe_match = - BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false, - VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe, - VPInstruction>; + Recipe_match<std::tuple<Op0_t, Op1_t>, Instruction::GetElementPtr, + /*Commutative*/ false, VPWidenRecipe, VPReplicateRecipe, + VPWidenGEPRecipe, VPInstruction>; template <typename Op0_t, typename Op1_t> inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0, @@ -533,22 +461,17 @@ inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0, return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1); } -template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode> -using AllTernaryRecipe_match = - Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false, - VPReplicateRecipe, VPInstruction, VPWidenSelectRecipe>; - template <typename Op0_t, typename Op1_t, typename 
Op2_t> -inline AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select> +inline AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t> m_Select(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { - return AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>( + return AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>( {Op0, Op1, Op2}); } template <typename Op0_t> -inline match_combine_or<UnaryVPInstruction_match<Op0_t, VPInstruction::Not>, - AllBinaryRecipe_match<int_pred_ty<is_all_ones>, Op0_t, - Instruction::Xor, true>> +inline match_combine_or<VPInstruction_match<VPInstruction::Not, Op0_t>, + AllRecipe_commutative_match< + Instruction::Xor, int_pred_ty<is_all_ones>, Op0_t>> m_Not(const Op0_t &Op0) { return m_CombineOr(m_VPInstruction<VPInstruction::Not>(Op0), m_c_Binary<Instruction::Xor>(m_AllOnes(), Op0)); @@ -556,9 +479,8 @@ m_Not(const Op0_t &Op0) { template <typename Op0_t, typename Op1_t> inline match_combine_or< - BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>, - AllTernaryRecipe_match<Op0_t, Op1_t, specific_intval<1>, - Instruction::Select>> + VPInstruction_match<VPInstruction::LogicalAnd, Op0_t, Op1_t>, + AllRecipe_match<Instruction::Select, Op0_t, Op1_t, specific_intval<1>>> m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { return m_CombineOr( m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1), @@ -566,15 +488,14 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) { } template <typename Op0_t, typename Op1_t> -inline AllTernaryRecipe_match<Op0_t, specific_intval<1>, Op1_t, - Instruction::Select> +inline AllRecipe_match<Instruction::Select, Op0_t, specific_intval<1>, Op1_t> m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) { return m_Select(Op0, m_True(), Op1); } template <typename Op0_t, typename Op1_t, typename Op2_t> -using VPScalarIVSteps_match = - TernaryRecipe_match<Op0_t, Op1_t, Op2_t, 0, false, VPScalarIVStepsRecipe>; +using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0, + false, VPScalarIVStepsRecipe>; template <typename Op0_t, typename Op1_t, typename Op2_t> inline VPScalarIVSteps_match<Op0_t, Op1_t, Op2_t> diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 862b930..cdadc33 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -238,14 +238,11 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { // optimizations will clean it up. 
SmallVector<VPValue *, 2> OperandsWithMask; - unsigned NumIncoming = PhiR->getNumIncoming(); - for (unsigned In = 0; In < NumIncoming; In++) { - const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); - OperandsWithMask.push_back(PhiR->getIncomingValue(In)); - VPValue *EdgeMask = getEdgeMask(Pred, VPBB); + for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) { + OperandsWithMask.push_back(InVPV); + VPValue *EdgeMask = getEdgeMask(InVPBB, VPBB); if (!EdgeMask) { - assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(PhiR->operands()) && + assert(all_equal(PhiR->incoming_values()) && "Distinct incoming values with one having a full mask"); break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e971ba1..7ca9b23 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -452,6 +452,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { switch (Opcode) { case VPInstruction::StepVector: + case VPInstruction::VScale: return 0; case Instruction::Alloca: case Instruction::ExtractValue: @@ -459,6 +460,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::Load: case VPInstruction::AnyOf: case VPInstruction::BranchOnCond: + case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExplicitVectorLength: @@ -517,6 +520,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: case VPInstruction::AnyOf: + case VPInstruction::Not: return true; default: return false; @@ -569,7 +573,8 @@ Value *VPInstruction::generate(VPTransformState &State) { switch (getOpcode()) { case VPInstruction::Not: { - Value *A = State.get(getOperand(0)); + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); + Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); return Builder.CreateNot(A, Name); } case Instruction::ExtractElement: { @@ -810,10 +815,18 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *RdxPart = RdxParts[Part]; if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - else - ReducedPartRdx = Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), - RdxPart, ReducedPartRdx, "bin.rdx"); + else { + Instruction::BinaryOps Opcode; + // For sub-recurrences, each UF's reduction variable is already + // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) + if (RK == RecurKind::Sub) + Opcode = Instruction::Add; + else + Opcode = + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); + ReducedPartRdx = + Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); + } } } @@ -922,6 +935,8 @@ Value *VPInstruction::generate(VPTransformState &State) { return Res; } + case VPInstruction::ResumeForEpilogue: + return State.get(getOperand(0), true); default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -998,6 +1013,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, I32Ty, {Arg0Ty, I32Ty, I1Ty}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ExtractLastElement: { + // Add on the cost of extracting the element. 
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, + VecTy, Ctx.CostKind, 0); + } case VPInstruction::ExtractPenultimateElement: if (VF == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); @@ -1027,6 +1048,8 @@ bool VPInstruction::isSingleScalar() const { switch (getOpcode()) { case Instruction::PHI: case VPInstruction::ExplicitVectorLength: + case VPInstruction::ResumeForEpilogue: + case VPInstruction::VScale: return true; default: return isScalarCast(); @@ -1076,6 +1099,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: + case Instruction::PHI: case VPInstruction::AnyOf: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -1093,6 +1117,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::WidePtrAdd: case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: + case VPInstruction::VScale: return false; default: return true; @@ -1116,6 +1141,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case Instruction::Select: case Instruction::Or: case Instruction::Freeze: + case VPInstruction::Not: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); case VPInstruction::ActiveLaneMask: @@ -1251,6 +1277,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; + case VPInstruction::ResumeForEpilogue: + O << "resume-for-epilogue"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -1281,6 +1310,12 @@ void VPInstructionWithType::execute(VPTransformState &State) { State.set(this, StepVector); break; } + case VPInstruction::VScale: { + Value *VScale = State.Builder.CreateVScale(ResultTy); + State.set(this, VScale, true); + break; + } + default: llvm_unreachable("opcode not implemented yet"); } @@ -1301,6 +1336,9 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, case VPInstruction::StepVector: O << "step-vector " << *ResultTy; break; + case VPInstruction::VScale: + O << "vscale " << *ResultTy; + break; default: assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); O << Instruction::getOpcodeName(getOpcode()) << " "; @@ -1434,12 +1472,12 @@ void VPIRPhi::print(raw_ostream &O, const Twine &Indent, if (getNumOperands() != 0) { O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; - interleaveComma( - enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { - Op.value()->printAsOperand(O, SlotTracker); - O << " from "; - getParent()->getPredecessors()[Op.index()]->printAsOperand(O); - }); + interleaveComma(incoming_values_and_blocks(), O, + [&O, &SlotTracker](auto Op) { + std::get<0>(Op)->printAsOperand(O, SlotTracker); + O << " from "; + std::get<1>(Op)->printAsOperand(O); + }); O << ")"; } } @@ -2934,7 +2972,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // transform, avoid computing their cost multiple times for now. Ctx.SkipCostComputation.insert(UI); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ResultTy = Ctx.Types.inferScalarType(this); switch (UI->getOpcode()) { case Instruction::GetElementPtr: @@ -2943,6 +2980,24 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // is scalarized or not. Therefore, we handle GEPs with the memory // instruction cost. 
return 0; + case Instruction::Call: { + if (!isSingleScalar()) { + // TODO: Handle remaining call costs here as well. + if (VF.isScalable()) + return InstructionCost::getInvalid(); + break; + } + + auto *CalledFn = + cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); + if (CalledFn->isIntrinsic()) + break; + + SmallVector<Type *, 4> Tys; + for (VPValue *ArgOp : drop_end(operands())) + Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + } case Instruction::Add: case Instruction::Sub: case Instruction::FAdd: @@ -2960,7 +3015,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, auto Op2Info = Ctx.getOperandInfo(getOperand(1)); SmallVector<const Value *, 4> Operands(UI->operand_values()); return Ctx.TTI.getArithmeticInstrCost( - UI->getOpcode(), ResultTy, CostKind, + UI->getOpcode(), ResultTy, Ctx.CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, UI, &Ctx.TLI) * (isSingleScalar() ? 1 : VF.getFixedValue()); @@ -3097,9 +3152,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); - return Ctx.TTI.getAddressComputationCost(Ty) + + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, + Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); } @@ -3445,6 +3502,8 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); + assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + "Masking gaps for scalable vectors is not yet supported."); const InterleaveGroup<Instruction> *Group = IG; Instruction *Instr = Group->getInsertPos(); @@ -3562,8 +3621,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); - assert((!MaskForGaps || !State.VF.isScalable()) && - "masking gaps for scalable vectors is not yet supported."); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. SmallVector<Value *, 4> StoredVecs; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1c8bd6c..cff43c2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -517,10 +517,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) { // everything WidenNewIV's users need. That is, WidenOriginalIV will // generate a vector phi or all users of WidenNewIV demand the first lane // only. 
- if (any_of(WidenOriginalIV->users(), - [WidenOriginalIV](VPUser *U) { - return !U->usesScalars(WidenOriginalIV); - }) || + if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) || vputils::onlyFirstLaneUsed(WidenNewIV)) { WidenNewIV->replaceAllUsesWith(WidenOriginalIV); WidenNewIV->eraseFromParent(); @@ -553,8 +550,22 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (isDeadRecipe(R)) + if (isDeadRecipe(R)) { R.eraseFromParent(); + continue; + } + + // Check if R is a dead VPPhi <-> update cycle and remove it. + auto *PhiR = dyn_cast<VPPhi>(&R); + if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1) + continue; + VPValue *Incoming = PhiR->getOperand(1); + if (*PhiR->user_begin() != Incoming->getDefiningRecipe() || + Incoming->getNumUsers() != 1) + continue; + PhiR->replaceAllUsesWith(PhiR->getOperand(0)); + PhiR->eraseFromParent(); + Incoming->getDefiningRecipe()->eraseFromParent(); } } } @@ -571,8 +582,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx"); // Truncate base induction if needed. - Type *CanonicalIVType = CanonicalIV->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(Plan); Type *ResultTy = TypeInfo.inferScalarType(BaseIV); if (TruncI) { Type *TruncTy = TruncI->getType(); @@ -868,7 +878,7 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, void VPlanTransforms::optimizeInductionExitUsers( VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) { VPBlockBase *MiddleVPBB = Plan.getMiddleBlock(); - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { for (VPRecipeBase &R : ExitVPBB->phis()) { auto *ExitIRI = cast<VPIRPhi>(&R); @@ -970,10 +980,11 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], Ops[1], cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); - case Instruction::InsertElement: - return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]); + // An extract of a live-in is an extract of a broadcast, so return the + // broadcasted element. case Instruction::ExtractElement: - return Folder.FoldExtractElement(Ops[0], Ops[1]); + assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); + return Ops[0]; } return nullptr; } @@ -1041,7 +1052,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { #ifndef NDEBUG // Verify that the cached type info is for both A and its users is still // accurate by comparing it to freshly computed types. 
- VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo2(*Plan); assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); for (VPUser *U : A->users()) { auto *R = cast<VPRecipeBase>(U); @@ -1202,9 +1213,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - if (match(Def, - m_VPInstruction<VPInstruction::ExtractLastElement>( - m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) { + if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>( + m_Broadcast(m_VPValue(A))))) { Def->replaceAllUsesWith(A); return; } @@ -1218,10 +1228,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } } -void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { +void VPlanTransforms::simplifyRecipes(VPlan &Plan) { ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( Plan.getEntry()); - VPTypeAnalysis TypeInfo(&CanonicalIVTy); + VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { simplifyRecipe(R, TypeInfo); @@ -1251,9 +1261,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // scalar results used. In the latter case, we would introduce extra // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) { - return !U->usesScalars(RepOrWidenR); - })) + !vputils::onlyScalarValuesUsed(RepOrWidenR)) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), @@ -1485,7 +1493,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // the region, otherwise replace the terminator controlling the latch with // (BranchOnCond true). auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); - auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); if (all_of(Header->phis(), IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, VPPhi>)) { @@ -1505,7 +1512,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPBlockUtils::connectBlocks(Preheader, Header); VPBlockUtils::connectBlocks(ExitingVPBB, Exit); - VPlanTransforms::simplifyRecipes(Plan, *CanIVTy); + VPlanTransforms::simplifyRecipes(Plan); } else { // The vector region contains header phis for which we cannot remove the // loop region yet. @@ -1748,7 +1755,8 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { if (!PhiR) continue; RecurKind RK = PhiR->getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) + if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub && + RK != RecurKind::AddChainWithSubs) continue; for (VPUser *U : collectUsersRecursively(PhiR)) @@ -1799,8 +1807,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( // other uses have different types for their operands, making them invalidly // typed. 
DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(Plan); VPBasicBlock *PH = Plan.getVectorPreheader(); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { @@ -1828,8 +1835,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( assert(OldResTy->isIntegerTy() && "only integer types supported"); (void)OldResSizeInBits; - LLVMContext &Ctx = CanonicalIVType->getContext(); - auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits); + auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits); // Any wrapping introduced by shrinking this operation shouldn't be // considered undefined behavior. So, we can't unconditionally copy @@ -1920,13 +1926,13 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantCanonicalIVs, Plan); runPass(removeRedundantInductionCasts, Plan); - runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyRecipes, Plan); runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); runPass(narrowToSingleScalarRecipes, Plan); runPass(legalizeAndOptimizeInductions, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); - runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyRecipes, Plan); runPass(removeBranchOnConst, Plan); runPass(removeDeadRecipes, Plan); @@ -2039,11 +2045,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( return LaneMaskPhi; } -/// Collect all VPValues representing a header mask through the (ICMP_ULE, -/// WideCanonicalIV, backedge-taken-count) pattern. +/// Collect the header mask with the pattern: +/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) /// TODO: Introduce explicit recipe for header-mask instead of searching /// for the header-mask pattern manually. -static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { +static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { SmallVector<VPValue *> WideCanonicalIVs; auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), @@ -2067,21 +2073,22 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { WideCanonicalIVs.push_back(WidenOriginalIV); } - // Walk users of wide canonical IVs and collect to all compares of the form + // Walk users of wide canonical IVs and find the single compare of the form // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). 
- SmallVector<VPValue *> HeaderMasks; + VPSingleDefRecipe *HeaderMask = nullptr; for (auto *Wide : WideCanonicalIVs) { for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { - auto *HeaderMask = dyn_cast<VPInstruction>(U); - if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan)) + auto *VPI = dyn_cast<VPInstruction>(U); + if (!VPI || !vputils::isHeaderMask(VPI, Plan)) continue; - assert(HeaderMask->getOperand(0) == Wide && + assert(VPI->getOperand(0) == Wide && "WidenCanonicalIV must be the first operand of the compare"); - HeaderMasks.push_back(HeaderMask); + assert(!HeaderMask && "Multiple header masks found?"); + HeaderMask = VPI; } } - return HeaderMasks; + return HeaderMask; } void VPlanTransforms::addActiveLaneMask( @@ -2097,6 +2104,7 @@ void VPlanTransforms::addActiveLaneMask( [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); + VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); auto *WideCanonicalIV = cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); VPSingleDefRecipe *LaneMask; @@ -2110,11 +2118,11 @@ void VPlanTransforms::addActiveLaneMask( "active.lane.mask"); } - // Walk users of WideCanonicalIV and replace all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an - // active-lane-mask. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) - HeaderMask->replaceAllUsesWith(LaneMask); + // Walk users of WideCanonicalIV and replace the header mask of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask, + // removing the old one to ensure there is always only a single header mask. + HeaderMask->replaceAllUsesWith(LaneMask); + HeaderMask->eraseFromParent(); } /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding @@ -2130,6 +2138,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL) { + // FIXME: Don't transform recipes to EVL recipes if they're not masked by the + // header mask. auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { assert(OrigMask && "Unmasked recipe when folding tail"); // HeaderMask will be handled using EVL. @@ -2139,14 +2149,35 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, return HeaderMask == OrigMask ? nullptr : OrigMask; }; + /// Adjust any end pointers so that they point to the end of EVL lanes not VF. 
+ auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { + auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); + if (!EndPtr) + return Addr; + assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && + "VPVectorEndPointerRecipe with non-VF VF operand?"); + assert( + all_of(EndPtr->users(), + [](VPUser *U) { + return cast<VPWidenMemoryRecipe>(U)->isReverse(); + }) && + "VPVectorEndPointerRecipe not used by reversed widened memory recipe?"); + VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); + EVLAddr->insertBefore(&CurRecipe); + EVLAddr->setOperand(1, &EVL); + return EVLAddr; + }; + return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { VPValue *NewMask = GetNewMask(L->getMask()); - return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); + VPValue *NewAddr = GetNewAddr(L->getAddr()); + return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); }) .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); - return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); + VPValue *NewAddr = GetNewAddr(S->getAddr()); + return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); }) .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); @@ -2172,9 +2203,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); - LLVMContext &Ctx = CanonicalIVType->getContext(); + VPTypeAnalysis TypeInfo(Plan); VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2183,7 +2212,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>) && "User of VF that we can't transform to EVL."); - Plan.getVF().replaceAllUsesWith(&EVL); + Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { + return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U); + }); assert(all_of(Plan.getVFxUF().users(), [&Plan](VPUser *U) { @@ -2213,9 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPValue *MaxEVL = &Plan.getVF(); // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer. 
VPBuilder Builder(LoopRegion->getPreheaderVPBB()); - MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx), - TypeInfo.inferScalarType(MaxEVL), - DebugLoc()); + MaxEVL = Builder.createScalarZExtOrTrunc( + MaxEVL, Type::getInt32Ty(Plan.getContext()), + TypeInfo.inferScalarType(MaxEVL), DebugLoc()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); VPValue *PrevEVL = @@ -2230,7 +2261,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { m_VPValue(V1), m_VPValue(V2)))) continue; VPValue *Imm = Plan.getOrAddLiveIn( - ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1)); + ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, @@ -2242,47 +2273,51 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } } + VPValue *HeaderMask = findHeaderMask(Plan); + if (!HeaderMask) + return; + + // Replace header masks with a mask equivalent to predicating by EVL: + // + // icmp ule widen-canonical-iv backedge-taken-count + // -> + // icmp ult step-vector, EVL + VPRecipeBase *EVLR = EVL.getDefiningRecipe(); + VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); + Type *EVLType = TypeInfo.inferScalarType(&EVL); + VPValue *EVLMask = Builder.createICmp( + CmpInst::ICMP_ULT, + Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); + HeaderMask->replaceAllUsesWith(EVLMask); + ToErase.push_back(HeaderMask->getDefiningRecipe()); + // Try to optimize header mask recipes away to their EVL variants. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - // TODO: Split optimizeMaskToEVL out and move into - // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in - // tryToBuildVPlanWithVPRecipes beforehand. - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *CurRecipe = cast<VPRecipeBase>(U); - VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); - if (!EVLRecipe) - continue; + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in + // tryToBuildVPlanWithVPRecipes beforehand. 
+ for (VPUser *U : collectUsersRecursively(EVLMask)) { + auto *CurRecipe = cast<VPRecipeBase>(U); + VPRecipeBase *EVLRecipe = + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + if (!EVLRecipe) + continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); - assert(NumDefVal == CurRecipe->getNumDefinedValues() && - "New recipe must define the same number of values as the " - "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); - EVLRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); - } - ToErase.push_back(CurRecipe); + [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + assert(NumDefVal == CurRecipe->getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert(NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + EVLRecipe->insertBefore(CurRecipe); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { + VPValue *CurVPV = CurRecipe->getVPSingleValue(); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } - - // Replace header masks with a mask equivalent to predicating by EVL: - // - // icmp ule widen-canonical-iv backedge-taken-count - // -> - // icmp ult step-vector, EVL - VPRecipeBase *EVLR = EVL.getDefiningRecipe(); - VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); - Type *EVLType = TypeInfo.inferScalarType(&EVL); - VPValue *EVLMask = Builder.createICmp( - CmpInst::ICMP_ULT, - Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); - HeaderMask->replaceAllUsesWith(EVLMask); - ToErase.push_back(HeaderMask->getDefiningRecipe()); + ToErase.push_back(CurRecipe); } + // Remove dead EVL mask. + if (EVLMask->getNumUsers() == 0) + ToErase.push_back(EVLMask->getDefiningRecipe()); for (VPRecipeBase *R : reverse(ToErase)) { SmallVector<VPValue *> PossiblyDead(R->operands()); @@ -2368,7 +2403,7 @@ void VPlanTransforms::addExplicitVectorLength( Builder.setInsertPoint(CanonicalIVIncrement); VPValue *OpVPEVL = VPEVL; - auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext()); + auto *I32Ty = Type::getInt32Ty(Plan.getContext()); OpVPEVL = Builder.createScalarZExtOrTrunc( OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc()); @@ -2579,10 +2614,10 @@ void VPlanTransforms::createInterleaveGroups( auto *InsertPos = cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos)); - bool InBounds = false; + GEPNoWrapFlags NW = GEPNoWrapFlags::none(); if (auto *Gep = dyn_cast<GetElementPtrInst>( getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts())) - InBounds = Gep->isInBounds(); + NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap(); // Get or create the start address for the interleave group. auto *Start = @@ -2606,8 +2641,7 @@ void VPlanTransforms::createInterleaveGroups( VPValue *OffsetVPV = Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); VPBuilder B(InsertPos); - Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV) - : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); + Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } // If the group is reverse, adjust the index to refer to the last vector // lane instead of the first. 
We adjust the index from the first vector @@ -2616,9 +2650,7 @@ void VPlanTransforms::createInterleaveGroups( if (IG->isReverse()) { auto *ReversePtr = new VPVectorEndPointerRecipe( Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), - -(int64_t)IG->getFactor(), - InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(), - InsertPos->getDebugLoc()); + -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc()); ReversePtr->insertBefore(InsertPos); Addr = ReversePtr; } @@ -2711,7 +2743,7 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, // Construct the initial value of the vector IV in the vector loop preheader. Type *IVIntTy = - IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits()); + IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits()); VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy); if (StepTy->isFloatingPointTy()) Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy); @@ -2838,9 +2870,8 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } -void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, - Type &CanonicalIVTy) { - VPTypeAnalysis TypeInfo(&CanonicalIVTy); +void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { + VPTypeAnalysis TypeInfo(Plan); SmallVector<VPRecipeBase *> ToRemove; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { @@ -3204,8 +3235,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { auto *VectorPreheader = Plan.getVectorPreheader(); for (VPValue *VPV : VPValues) { - if (all_of(VPV->users(), - [VPV](VPUser *U) { return U->usesScalars(VPV); }) || + if (vputils::onlyScalarValuesUsed(VPV) || (VPV->isLiveIn() && VPV->getLiveInIRValue() && isa<Constant>(VPV->getLiveInIRValue()))) continue; @@ -3278,6 +3308,149 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, BTC->replaceAllUsesWith(TCMO); } +void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { + if (Plan.hasScalarVFOnly()) + return; + + VPTypeAnalysis TypeInfo(Plan); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(Plan.getEntry())); + auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(LoopRegion->getEntry())); + // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes, + // excluding ones in replicate regions. Those are not materialized explicitly + // yet. Those vector users are still handled in VPReplicateRegion::execute(), + // via shouldPack(). + // TODO: materialize build vectors for replicating recipes in replicating + // regions. + // TODO: materialize build vectors for VPInstructions. + for (VPBasicBlock *VPBB : + concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *RepR = dyn_cast<VPReplicateRecipe>(&R); + auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) { + VPRegionBlock *ParentRegion = + cast<VPRecipeBase>(U)->getParent()->getParent(); + return !U->usesScalars(RepR) || ParentRegion != LoopRegion; + }; + if (!RepR || RepR->isSingleScalar() || + none_of(RepR->users(), UsesVectorOrInsideReplicateRegion)) + continue; + + Type *ScalarTy = TypeInfo.inferScalarType(RepR); + unsigned Opcode = ScalarTy->isStructTy() + ? 
VPInstruction::BuildStructVector + : VPInstruction::BuildVector; + auto *BuildVector = new VPInstruction(Opcode, {RepR}); + BuildVector->insertAfter(RepR); + + RepR->replaceUsesWithIf( + BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion]( + VPUser &U, unsigned) { + return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U); + }); + } + } +} + +void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue) { + VPValue &VectorTC = Plan.getVectorTripCount(); + assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in"); + // There's nothing to do if there are no users of the vector trip count or its + // IR value has already been set. + if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue()) + return; + + VPValue *TC = Plan.getTripCount(); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC); + VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin()); + VPValue *Step = &Plan.getVFxUF(); + + // If the tail is to be folded by masking, round the number of iterations N + // up to a multiple of Step instead of rounding down. This is done by first + // adding Step-1 and then rounding down. Note that it's ok if this addition + // overflows: the vector induction variable will eventually wrap to zero given + // that it starts at zero and its Step is a power of two; the loop will then + // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. + if (TailByMasking) { + TC = Builder.createNaryOp( + Instruction::Add, + {TC, Builder.createNaryOp( + Instruction::Sub, + {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + DebugLoc::getCompilerGenerated(), "n.rnd.up"); + } + + // Now we need to generate the expression for the part of the loop that the + // vectorized body will execute. This is equal to N - (N % Step) if scalar + // iterations are not required for correctness, or N - Step, otherwise. Step + // is equal to the vectorization factor (number of SIMD elements) times the + // unroll factor (number of SIMD instructions). + VPValue *R = + Builder.createNaryOp(Instruction::URem, {TC, Step}, + DebugLoc::getCompilerGenerated(), "n.mod.vf"); + + // There are cases where we *must* run at least one iteration in the remainder + // loop. See the cost model for when this can happen. If the step evenly + // divides the trip count, we set the remainder to be equal to the step. If + // the step does not evenly divide the trip count, no adjustment is necessary + // since there will already be scalar iterations. Note that the minimum + // iterations check ensures that N >= Step. 
+ if (RequiresScalarEpilogue) { + assert(!TailByMasking && + "requiring scalar epilogue is not supported with tail folding"); + VPValue *IsZero = Builder.createICmp( + CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + R = Builder.createSelect(IsZero, Step, R); + } + + VPValue *Res = Builder.createNaryOp( + Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec"); + VectorTC.replaceAllUsesWith(Res); +} + +void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, + ElementCount VFEC) { + VPBuilder Builder(VectorPH, VectorPH->begin()); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); + VPValue &VF = Plan.getVF(); + VPValue &VFxUF = Plan.getVFxUF(); + // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be + // used. + // TODO: Assert that they aren't used. + + // If there are no users of the runtime VF, compute VFxUF by constant folding + // the multiplication of VF and UF. + if (VF.getNumUsers() == 0) { + VPValue *RuntimeVFxUF = + Builder.createElementCount(TCTy, VFEC * Plan.getUF()); + VFxUF.replaceAllUsesWith(RuntimeVFxUF); + return; + } + + // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF * + // vscale) * UF. + VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC); + if (!vputils::onlyScalarValuesUsed(&VF)) { + VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF); + VF.replaceUsesWithIf( + BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); }); + } + VF.replaceAllUsesWith(RuntimeVF); + + VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *MulByUF = Plan.getUF() == 1 ? RuntimeVF + : Builder.createNaryOp(Instruction::Mul, + {RuntimeVF, UF}); + VFxUF.replaceAllUsesWith(MulByUF); +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
- LLVM_ABI_FOR_TEST static void prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableExit, VFRange &Range); + /// Create a base VPlan0, serving as the common starting point for all later + /// candidates. It consists of an initial plain CFG loop with loop blocks from + /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction + /// corresponding to the input IR. + /// + /// The created loop is wrapped in an initial skeleton to facilitate + /// vectorization, consisting of a vector pre-header, an exit block for the + /// main vector loop (middle.block) and a new block as preheader of the scalar + /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p + /// InductionTy and \p IVDL, and creates a VPValue expression for the original + /// trip count. + LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> + buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE); + + /// Update \p Plan to account for all early exits. + LLVM_ABI_FOR_TEST static void + handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + + /// If a check is needed to guard executing the scalar epilogue loop, it will + /// be added to the middle block. + LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded); /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG. @@ -113,7 +122,7 @@ struct VPlanTransforms { static void clearReductionWrapFlags(VPlan &Plan); /// Explicitly unroll \p Plan by \p UF. - static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); + static void unrollByUF(VPlan &Plan, unsigned UF); /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan /// with \p VF single-scalar recipes. @@ -220,9 +229,8 @@ struct VPlanTransforms { /// EVLIVInc, TripCount). static void canonicalizeEVLLoops(VPlan &Plan); - /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p - /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. - static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Lower abstract recipes to concrete ones, that can be codegen'd. + static void convertToConcreteRecipes(VPlan &Plan); /// This function converts initial recipes to the abstract recipes and clamps /// \p Range based on cost model for following optimizations and cost @@ -231,9 +239,8 @@ struct VPlanTransforms { static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range); - /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p - /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. - static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Perform instcombine-like simplifications on recipes in \p Plan. + static void simplifyRecipes(VPlan &Plan); /// Remove BranchOnCond recipes with true or false conditions together with /// removing dead edges to their successors. @@ -256,11 +263,25 @@ struct VPlanTransforms { unsigned BestUF, PredicatedScalarEvolution &PSE); + /// Materialize vector trip count computations to a set of VPInstructions. 
+ static void materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue); + /// Materialize the backedge-taken count to be computed explicitly using /// VPInstructions. static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH); + /// Add explicit Build[Struct]Vector recipes that combine multiple scalar + /// values into single vectors. + static void materializeBuildVectors(VPlan &Plan); + + /// Materialize VF and VFxUF to be computed explicitly using VPInstructions. + static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, + ElementCount VF); + /// Try to convert a plan with interleave groups with VF elements to a plan /// with the interleave groups replaced by wide loads and stores processing VF /// elements, if all transformed interleave groups access the full vector diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index fc072de..62fd83a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -74,8 +74,7 @@ class UnrollState { } public: - UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx) - : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {} + UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {} void unrollBlock(VPBlockBase *VPB); @@ -409,7 +408,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { } } -void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { +void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { assert(UF > 0 && "Unroll factor must be positive"); Plan.setUF(UF); auto Cleanup = make_scope_exit([&Plan]() { @@ -431,7 +430,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { return; } - UnrollState Unroller(Plan, UF, Ctx); + UnrollState Unroller(Plan, UF); // Iterate over all blocks in the plan starting from Entry, and unroll // recipes inside them. This includes the vector preheader and middle blocks, @@ -465,10 +464,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { VPlanTransforms::removeDeadRecipes(Plan); } -/// Create a single-scalar clone of \p RepR for lane \p Lane. -static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, - Type *IdxTy, VPReplicateRecipe *RepR, - VPLane Lane) { +/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p +/// Def2LaneDefs to look up scalar definitions for operands of \RepR. +static VPReplicateRecipe * +cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, + VPReplicateRecipe *RepR, VPLane Lane, + const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { // Collect the operands at Lane, creating extracts as needed. SmallVector<VPValue *> NewOps; for (VPValue *Op : RepR->operands()) { @@ -481,6 +482,14 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; } + // If Op is a definition that has been unrolled, directly use the clone for + // the corresponding lane. + auto LaneDefs = Def2LaneDefs.find(Op); + if (LaneDefs != Def2LaneDefs.end()) { + NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]); + continue; + } + // Look through buildvector to avoid unnecessary extracts. 
if (match(Op, m_BuildVector())) { NewOps.push_back( @@ -513,6 +522,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())); auto VPBBsToUnroll = concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion); + // A mapping of current VPValue definitions to collections of new VPValues + // defined per lane. Serves to hook-up potential users of current VPValue + // definition that are replicated-per-VF later. + DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs; + // The removal of current recipes being replaced by new ones needs to be + // delayed after Def2LaneDefs is no longer in use. + SmallVector<VPRecipeBase *> ToRemove; for (VPBasicBlock *VPBB : VPBBsToUnroll) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *RepR = dyn_cast<VPReplicateRecipe>(&R); @@ -524,12 +540,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { if (isa<StoreInst>(RepR->getUnderlyingInstr()) && vputils::isSingleScalar(RepR->getOperand(1))) { // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, - VPLane::getLastLaneForVF(VF)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), + Def2LaneDefs); } else { // Create single-scalar version of RepR for all lanes. for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); } RepR->eraseFromParent(); continue; @@ -537,23 +553,33 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Create single-scalar version of RepR for all lanes. SmallVector<VPValue *> LaneDefs; for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); + LaneDefs.push_back( + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs)); + Def2LaneDefs[RepR] = LaneDefs; /// Users that only demand the first lane can use the definition for lane /// 0. RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); }); - // If needed, create a Build(Struct)Vector recipe to insert the scalar - // lane values into a vector. - Type *ResTy = RepR->getUnderlyingInstr()->getType(); - VPValue *VecRes = Builder.createNaryOp( - ResTy->isStructTy() ? VPInstruction::BuildStructVector - : VPInstruction::BuildVector, - LaneDefs); - RepR->replaceAllUsesWith(VecRes); - RepR->eraseFromParent(); + // Update each build vector user that currently has RepR as its only + // operand, to have all LaneDefs as its operands. 
+ for (VPUser *U : to_vector(RepR->users())) { + auto *VPI = dyn_cast<VPInstruction>(U); + if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && + VPI->getOpcode() != VPInstruction::BuildStructVector)) + continue; + assert(VPI->getNumOperands() == 1 && + "Build(Struct)Vector must have a single operand before " + "replicating by VF"); + VPI->setOperand(0, LaneDefs[0]); + for (VPValue *LaneDef : drop_begin(LaneDefs)) + VPI->addOperand(LaneDef); + } + ToRemove.push_back(RepR); } } + for (auto *R : reverse(ToRemove)) + R->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 14f20c6..b2230c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -24,6 +24,11 @@ bool vputils::onlyFirstPartUsed(const VPValue *Def) { [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); } +bool vputils::onlyScalarValuesUsed(const VPValue *Def) { + return all_of(Def->users(), + [Def](const VPUser *U) { return U->usesScalars(Def); }); +} + VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 8dcd57f..3cf02b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -25,6 +25,9 @@ bool onlyFirstLaneUsed(const VPValue *Def); /// Returns true if only the first part of \p Def is used. bool onlyFirstPartUsed(const VPValue *Def); +/// Returns true if only scalar values of \p Def are used by all users. +bool onlyScalarValuesUsed(const VPValue *Def); + /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 3417e1c..e25ffe1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -183,6 +183,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { case Instruction::ZExt: case Instruction::Mul: case Instruction::FMul: + case VPInstruction::Broadcast: // Opcodes above can only use EVL after wide inductions have been // expanded. 
if (!VerifyLate) { @@ -250,17 +251,15 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { - for (unsigned Idx = 0; Idx != Phi->getNumIncoming(); ++Idx) { - VPValue *IncomingVPV = Phi->getIncomingValue(Idx); + for (const auto &[IncomingVPV, IncomingVPBB] : + Phi->incoming_values_and_blocks()) { if (IncomingVPV != V) continue; - const VPBasicBlock *IncomingVPBB = Phi->getIncomingBlock(Idx); if (VPDT.dominates(VPBB, IncomingVPBB)) continue; - errs() << "Incoming def at index " << Idx - << " does not dominate incoming block!\n"; + errs() << "Incoming def does not dominate incoming block!\n"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) VPSlotTracker Tracker(VPBB->getPlan()); IncomingVPV->getDefiningRecipe()->print(errs(), " ", Tracker); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 6345b18..1275d53 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -29,11 +30,13 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <numeric> +#include <optional> #include <queue> #include <set> @@ -74,7 +77,7 @@ public: const DataLayout *DL, TTI::TargetCostKind CostKind, bool TryEarlyFoldsOnly) : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI), - DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), + DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} bool run(); @@ -88,6 +91,7 @@ private: AssumptionCache &AC; const DataLayout *DL; TTI::TargetCostKind CostKind; + const SimplifyQuery SQ; /// If true, only perform beneficial early IR transforms. Do not introduce new /// vector operations. 
@@ -107,10 +111,8 @@ private: const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex); - void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); - void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); + Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); + Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); bool foldInsExtBinop(Instruction &I); @@ -137,8 +139,10 @@ private: bool foldSelectShuffle(Instruction &I, bool FromReduction = false); bool foldInterleaveIntrinsics(Instruction &I); bool shrinkType(Instruction &I); + bool shrinkLoadForShuffles(Instruction &I); + bool shrinkPhiOfShuffles(Instruction &I); - void replaceValue(Value &Old, Value &New) { + void replaceValue(Instruction &Old, Value &New, bool Erase = true) { LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); @@ -147,7 +151,11 @@ private: Worklist.pushUsersToWorkList(*NewI); Worklist.pushValue(NewI); } - Worklist.pushValue(&Old); + if (Erase && isInstructionTriviallyDead(&Old)) { + eraseInstruction(Old); + } else { + Worklist.push(&Old); + } } void eraseInstruction(Instruction &I) { @@ -158,11 +166,23 @@ private: // Push remaining users of the operands and then the operand itself - allows // further folds that were hindered by OneUse limits. - for (Value *Op : Ops) - if (auto *OpI = dyn_cast<Instruction>(Op)) { - Worklist.pushUsersToWorkList(*OpI); - Worklist.pushValue(OpI); + SmallPtrSet<Value *, 4> Visited; + for (Value *Op : Ops) { + if (Visited.insert(Op).second) { + if (auto *OpI = dyn_cast<Instruction>(Op)) { + if (RecursivelyDeleteTriviallyDeadInstructions( + OpI, nullptr, nullptr, [this](Value *V) { + if (auto I = dyn_cast<Instruction>(V)) { + LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); + Worklist.remove(I); + } + })) + continue; + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } + } } }; } // namespace @@ -546,9 +566,8 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, /// the source vector (shift the scalar element) to a NewIndex for extraction. /// Return null if the input can be constant folded, so that we are not creating /// unnecessary instructions. -static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, - unsigned NewIndex, - IRBuilderBase &Builder) { +static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, + IRBuilderBase &Builder) { // Shufflevectors can only be created for fixed-width vectors. Value *X = ExtElt->getVectorOperand(); if (!isa<FixedVectorType>(X->getType())) @@ -563,52 +582,43 @@ static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), NewIndex, Builder); - return dyn_cast<ExtractElementInst>( - Builder.CreateExtractElement(Shuf, NewIndex)); + return Shuf; } /// Try to reduce extract element costs by converting scalar compares to vector /// compares followed by extract. 
-/// cmp (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa<CmpInst>(&I) && "Expected a compare"); - assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == - cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C + // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (cmp Pred V0, V1), ExtIndex ++NumVecCmp; CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); - Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp"); } /// Try to reduce extract element costs by converting scalar binops to vector /// binops followed by extract. -/// bo (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); - assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == - cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C + // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (bo V0, V1), ExtIndex ++NumVecBO; - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); - Value *VecBO = - Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); + Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, + V1, "foldExtExtBinop"); // All IR flags are safe to back-propagate because any potential poison // created in unused vector elements is discarded by the extract. if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) VecBOInst->copyIRFlags(&I); - Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop"); } /// Match an instruction with extracted vector operands. @@ -647,25 +657,29 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) return false; + Value *ExtOp0 = Ext0->getVectorOperand(); + Value *ExtOp1 = Ext1->getVectorOperand(); + if (ExtractToChange) { unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; - ExtractElementInst *NewExtract = + Value *NewExtOp = translateExtract(ExtractToChange, CheapExtractIdx, Builder); - if (!NewExtract) + if (!NewExtOp) return false; if (ExtractToChange == Ext0) - Ext0 = NewExtract; + ExtOp0 = NewExtOp; else - Ext1 = NewExtract; + ExtOp1 = NewExtOp; } - if (Pred != CmpInst::BAD_ICMP_PREDICATE) - foldExtExtCmp(Ext0, Ext1, I); - else - foldExtExtBinop(Ext0, Ext1, I); - + Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand() + : Ext0->getIndexOperand(); + Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE + ? 
foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I) + : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I); Worklist.push(Ext0); Worklist.push(Ext1); + replaceValue(I, *NewExt); return true; } @@ -1232,17 +1246,18 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { // Fold the vector constants in the original vectors into a new base vector to // get more accurate cost modelling. Value *NewVecC = nullptr; - TargetFolder Folder(*DL); if (CI) - NewVecC = Folder.FoldCmp(CI->getPredicate(), VecCs[0], VecCs[1]); + NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ); else if (UO) NewVecC = - Folder.FoldUnOpFMF(UO->getOpcode(), VecCs[0], UO->getFastMathFlags()); + simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ); else if (BO) - NewVecC = Folder.FoldBinOp(BO->getOpcode(), VecCs[0], VecCs[1]); - else if (II->arg_size() == 2) - NewVecC = Folder.FoldBinaryIntrinsic(II->getIntrinsicID(), VecCs[0], - VecCs[1], II->getType(), &I); + NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ); + else if (II) + NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ); + + if (!NewVecC) + return false; // Get cost estimate for the insert element. This cost will factor into // both sequences. @@ -1250,6 +1265,7 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { InstructionCost NewCost = ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, *Index, NewVecC); + for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) { if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg( II->getIntrinsicID(), Idx, &TTI))) @@ -1294,15 +1310,6 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) { if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) ScalarInst->copyIRFlags(&I); - // Create a new base vector if the constant folding failed. - if (!NewVecC) { - if (CI) - NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]); - else if (UO || BO) - NewVecC = Builder.CreateNAryOp(Opcode, VecCs); - else - NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), VecCs); - } Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index); replaceValue(I, *Insert); return true; @@ -1790,7 +1797,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), Align(1), LI->getPointerAddressSpace(), CostKind); - ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); + ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(), + nullptr, nullptr, CostKind); } LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I @@ -1804,6 +1812,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { // erased in the correct order. Worklist.push(LI); + Type *ElemType = VecTy->getElementType(); + // Replace extracts with narrow scalar loads. 
for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); @@ -1817,14 +1827,20 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { Builder.SetInsertPoint(EI); Value *GEP = Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); - auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( - VecTy->getElementType(), GEP, EI->getName() + ".scalar")); + auto *NewLoad = cast<LoadInst>( + Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar")); - Align ScalarOpAlignment = computeAlignmentAfterScalarization( - LI->getAlign(), VecTy->getElementType(), Idx, *DL); + Align ScalarOpAlignment = + computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL); NewLoad->setAlignment(ScalarOpAlignment); - replaceValue(*EI, *NewLoad); + if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) { + size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType); + AAMDNodes OldAAMD = LI->getAAMetadata(); + NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL)); + } + + replaceValue(*EI, *NewLoad, false); } FailureGuard.release(); @@ -1856,15 +1872,15 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { unsigned ExtCnt = 0; bool ExtLane0 = false; for (User *U : Ext->users()) { - const APInt *Idx; - if (!match(U, m_ExtractElt(m_Value(), m_APInt(Idx)))) + uint64_t Idx; + if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx)))) return false; if (cast<Instruction>(U)->use_empty()) continue; ExtCnt += 1; - ExtLane0 |= Idx->isZero(); + ExtLane0 |= !Idx; VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy, - CostKind, Idx->getZExtValue(), U); + CostKind, Idx, U); } InstructionCost ScalarCost = @@ -2910,7 +2926,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (!IL.first) return true; Value *V = IL.first->get(); - if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse()) + if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser()) return false; if (V->getValueID() != FrontV->getValueID()) return false; @@ -3112,7 +3128,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); replaceValue(*Shuffle, *NewShuffle); - MadeChanges = true; + return true; } // See if we can re-use foldSelectShuffle, getting it to reduce the size of @@ -3608,7 +3624,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { Builder.SetInsertPoint(Shuffles[S]); Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); - replaceValue(*Shuffles[S], *NSV); + replaceValue(*Shuffles[S], *NSV, false); } Worklist.pushValue(NSV0A); @@ -3861,6 +3877,228 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { return true; } +// Attempt to shrink loads that are only used by shufflevector instructions. +bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { + auto *OldLoad = dyn_cast<LoadInst>(&I); + if (!OldLoad || !OldLoad->isSimple()) + return false; + + auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType()); + if (!OldLoadTy) + return false; + + unsigned const OldNumElements = OldLoadTy->getNumElements(); + + // Search all uses of load. If all uses are shufflevector instructions, and + // the second operands are all poison values, find the minimum and maximum + // indices of the vector elements referenced by all shuffle masks. + // Otherwise return `std::nullopt`. 
+ using IndexRange = std::pair<int, int>; + auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> { + IndexRange OutputRange = IndexRange(OldNumElements, -1); + for (llvm::Use &Use : I.uses()) { + // Ensure all uses match the required pattern. + User *Shuffle = Use.getUser(); + ArrayRef<int> Mask; + + if (!match(Shuffle, + m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) + return std::nullopt; + + // Ignore shufflevector instructions that have no uses. + if (Shuffle->use_empty()) + continue; + + // Find the min and max indices used by the shufflevector instruction. + for (int Index : Mask) { + if (Index >= 0 && Index < static_cast<int>(OldNumElements)) { + OutputRange.first = std::min(Index, OutputRange.first); + OutputRange.second = std::max(Index, OutputRange.second); + } + } + } + + if (OutputRange.second < OutputRange.first) + return std::nullopt; + + return OutputRange; + }; + + // Get the range of vector elements used by shufflevector instructions. + if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) { + unsigned const NewNumElements = Indices->second + 1u; + + // If the range of vector elements is smaller than the full load, attempt + // to create a smaller load. + if (NewNumElements < OldNumElements) { + IRBuilder Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + // Calculate costs of old and new ops. + Type *ElemTy = OldLoadTy->getElementType(); + FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); + Value *PtrOp = OldLoad->getPointerOperand(); + + InstructionCost OldCost = TTI.getMemoryOpCost( + Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + InstructionCost NewCost = + TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(), + OldLoad->getPointerAddressSpace(), CostKind); + + using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; + SmallVector<UseEntry, 4u> NewUses; + unsigned const MaxIndex = NewNumElements * 2u; + + for (llvm::Use &Use : I.uses()) { + auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); + ArrayRef<int> OldMask = Shuffle->getShuffleMask(); + + // Create entry for new use. + NewUses.push_back({Shuffle, OldMask}); + + // Validate mask indices. + for (int Index : OldMask) { + if (Index >= static_cast<int>(MaxIndex)) + return false; + } + + // Update costs. + OldCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + OldLoadTy, OldMask, CostKind); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), + NewLoadTy, OldMask, CostKind); + } + + LLVM_DEBUG( + dbgs() << "Found a load used only by shufflevector instructions: " + << I << "\n OldCost: " << OldCost + << " vs NewCost: " << NewCost << "\n"); + + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Create new load of smaller vector. + auto *NewLoad = cast<LoadInst>( + Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign())); + NewLoad->copyMetadata(I); + + // Replace all uses. 
+ for (UseEntry &Use : NewUses) { + ShuffleVectorInst *Shuffle = Use.first; + std::vector<int> &NewMask = Use.second; + + Builder.SetInsertPoint(Shuffle); + Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); + Value *NewShuffle = Builder.CreateShuffleVector( + NewLoad, PoisonValue::get(NewLoadTy), NewMask); + + replaceValue(*Shuffle, *NewShuffle, false); + } + + return true; + } + } + return false; +} + +// Attempt to narrow a phi of shufflevector instructions where the two incoming +// values have the same operands but different masks. If the two shuffle masks +// are offsets of one another we can use one branch to rotate the incoming +// vector and perform one larger shuffle after the phi. +bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) { + auto *Phi = dyn_cast<PHINode>(&I); + if (!Phi || Phi->getNumIncomingValues() != 2u) + return false; + + Value *Op = nullptr; + ArrayRef<int> Mask0; + ArrayRef<int> Mask1; + + if (!match(Phi->getOperand(0u), + m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) || + !match(Phi->getOperand(1u), + m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1))))) + return false; + + auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u)); + + // Ensure result vectors are wider than the argument vector. + auto *InputVT = cast<FixedVectorType>(Op->getType()); + auto *ResultVT = cast<FixedVectorType>(Shuf->getType()); + auto const InputNumElements = InputVT->getNumElements(); + + if (InputNumElements >= ResultVT->getNumElements()) + return false; + + // Take the difference of the two shuffle masks at each index. Ignore poison + // values at the same index in both masks. + SmallVector<int, 16> NewMask; + NewMask.reserve(Mask0.size()); + + for (auto [M0, M1] : zip(Mask0, Mask1)) { + if (M0 >= 0 && M1 >= 0) + NewMask.push_back(M0 - M1); + else if (M0 == -1 && M1 == -1) + continue; + else + return false; + } + + // Ensure all elements of the new mask are equal. If the difference between + // the incoming mask elements is the same, the two must be constant offsets + // of one another. + if (NewMask.empty() || !all_equal(NewMask)) + return false; + + // Create new mask using difference of the two incoming masks. + int MaskOffset = NewMask[0u]; + unsigned Index = (InputNumElements - MaskOffset) % InputNumElements; + NewMask.clear(); + + for (unsigned I = 0u; I < InputNumElements; ++I) { + NewMask.push_back(Index); + Index = (Index + 1u) % InputNumElements; + } + + // Calculate costs for worst cases and compare. + auto const Kind = TTI::SK_PermuteSingleSrc; + auto OldCost = + std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind), + TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind)); + auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) + + TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind); + + LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + + if (NewCost > OldCost) + return false; + + // Create new shuffles and narrowed phi. 
+ auto Builder = IRBuilder(Shuf); + Builder.SetCurrentDebugLocation(Shuf->getDebugLoc()); + auto *PoisonVal = PoisonValue::get(InputVT); + auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask); + Worklist.push(cast<Instruction>(NewShuf0)); + + Builder.SetInsertPoint(Phi); + Builder.SetCurrentDebugLocation(Phi->getDebugLoc()); + auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u); + NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u)); + NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u)); + + Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef()); + PoisonVal = PoisonValue::get(NewPhi->getType()); + auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1); + + replaceValue(*Phi, *NewShuf1); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -3873,8 +4111,7 @@ bool VectorCombine::run() { LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n"); - bool MadeChange = false; - auto FoldInst = [this, &MadeChange](Instruction &I) { + auto FoldInst = [this](Instruction &I) { Builder.SetInsertPoint(&I); bool IsVectorType = isa<VectorType>(I.getType()); bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); @@ -3889,10 +4126,12 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= vectorizeLoadInsert(I); + if (vectorizeLoadInsert(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= widenSubvectorLoad(I); + if (widenSubvectorLoad(I)) + return true; break; default: break; @@ -3902,19 +4141,25 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms if (IsVectorType) { - MadeChange |= scalarizeOpOrCmp(I); - MadeChange |= scalarizeLoadExtract(I); - MadeChange |= scalarizeExtExtract(I); - MadeChange |= scalarizeVPIntrinsic(I); - MadeChange |= foldInterleaveIntrinsics(I); + if (scalarizeOpOrCmp(I)) + return true; + if (scalarizeLoadExtract(I)) + return true; + if (scalarizeExtExtract(I)) + return true; + if (scalarizeVPIntrinsic(I)) + return true; + if (foldInterleaveIntrinsics(I)) + return true; } if (Opcode == Instruction::Store) - MadeChange |= foldSingleElementStore(I); + if (foldSingleElementStore(I)) + return true; // If this is an early pipeline invocation of this pass, we are done. if (TryEarlyFoldsOnly) - return; + return false; // Otherwise, try folds that improve codegen but may interfere with // early IR canonicalizations. 
@@ -3923,56 +4168,87 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= foldInsExtFNeg(I); - MadeChange |= foldInsExtBinop(I); - MadeChange |= foldInsExtVectorToShuffle(I); + if (foldInsExtFNeg(I)) + return true; + if (foldInsExtBinop(I)) + return true; + if (foldInsExtVectorToShuffle(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= foldPermuteOfBinops(I); - MadeChange |= foldShuffleOfBinops(I); - MadeChange |= foldShuffleOfSelects(I); - MadeChange |= foldShuffleOfCastops(I); - MadeChange |= foldShuffleOfShuffles(I); - MadeChange |= foldShuffleOfIntrinsics(I); - MadeChange |= foldSelectShuffle(I); - MadeChange |= foldShuffleToIdentity(I); + if (foldPermuteOfBinops(I)) + return true; + if (foldShuffleOfBinops(I)) + return true; + if (foldShuffleOfSelects(I)) + return true; + if (foldShuffleOfCastops(I)) + return true; + if (foldShuffleOfShuffles(I)) + return true; + if (foldShuffleOfIntrinsics(I)) + return true; + if (foldSelectShuffle(I)) + return true; + if (foldShuffleToIdentity(I)) + return true; + break; + case Instruction::Load: + if (shrinkLoadForShuffles(I)) + return true; break; case Instruction::BitCast: - MadeChange |= foldBitcastShuffle(I); + if (foldBitcastShuffle(I)) + return true; break; case Instruction::And: case Instruction::Or: case Instruction::Xor: - MadeChange |= foldBitOpOfCastops(I); + if (foldBitOpOfCastops(I)) + return true; + break; + case Instruction::PHI: + if (shrinkPhiOfShuffles(I)) + return true; break; default: - MadeChange |= shrinkType(I); + if (shrinkType(I)) + return true; break; } } else { switch (Opcode) { case Instruction::Call: - MadeChange |= foldShuffleFromReductions(I); - MadeChange |= foldCastFromReductions(I); + if (foldShuffleFromReductions(I)) + return true; + if (foldCastFromReductions(I)) + return true; break; case Instruction::ICmp: case Instruction::FCmp: - MadeChange |= foldExtractExtract(I); + if (foldExtractExtract(I)) + return true; break; case Instruction::Or: - MadeChange |= foldConcatOfBoolMasks(I); + if (foldConcatOfBoolMasks(I)) + return true; [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { - MadeChange |= foldExtractExtract(I); - MadeChange |= foldExtractedCmps(I); - MadeChange |= foldBinopOfReductions(I); + if (foldExtractExtract(I)) + return true; + if (foldExtractedCmps(I)) + return true; + if (foldBinopOfReductions(I)) + return true; } break; } } + return false; }; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) @@ -3981,7 +4257,7 @@ bool VectorCombine::run() { for (Instruction &I : make_early_inc_range(BB)) { if (I.isDebugOrPseudoInst()) continue; - FoldInst(I); + MadeChange |= FoldInst(I); } } @@ -3995,7 +4271,7 @@ bool VectorCombine::run() { continue; } - FoldInst(*I); + MadeChange |= FoldInst(*I); } return MadeChange; |
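The shrinkLoadForShuffles hunk above narrows a vector load to the highest lane that its shufflevector users actually read. A minimal standalone sketch of that index-range computation, written in plain C++ rather than LLVM's own types (the name shrunkenElementCount and its signature are illustrative, not part of the patch):

#include <algorithm>
#include <vector>

// Given the masks of every shufflevector user of a load with OldNumElements
// lanes (-1 models a poison lane; entries >= OldNumElements refer to the
// undef/poison second shuffle operand and are ignored, as in the pass),
// return the element count the load could be shrunk to. Returns
// OldNumElements when no shrinking is possible.
unsigned shrunkenElementCount(const std::vector<std::vector<int>> &Masks,
                              unsigned OldNumElements) {
  int MaxIndex = -1;
  for (const std::vector<int> &Mask : Masks)
    for (int Index : Mask)
      if (Index >= 0 && Index < static_cast<int>(OldNumElements))
        MaxIndex = std::max(MaxIndex, Index);
  if (MaxIndex < 0)
    return OldNumElements; // No lane of the load is used at all.
  unsigned NewNumElements = static_cast<unsigned>(MaxIndex) + 1u;
  return NewNumElements < OldNumElements ? NewNumElements : OldNumElements;
}

For example, masks {0,2,1,3} and {3,3} over an 8-lane load give 4, so the pass can emit a 4-lane load and rewrite the shuffles against it, provided the TTI cost comparison in the hunk favors the narrower load.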
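The shrinkPhiOfShuffles hunk above only fires when the two incoming shuffle masks differ by a single constant offset. A standalone sketch of that offset check, again in plain C++ (maskOffset is an illustrative name; it assumes both masks index the same single operand, which the pass enforces by matching a poison second shuffle operand):

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

// Returns the constant offset D such that Mask0[i] == Mask1[i] + D for every
// lane that is non-poison in both masks, or std::nullopt if no such offset
// exists. -1 models a poison mask element.
std::optional<int> maskOffset(const std::vector<int> &Mask0,
                              const std::vector<int> &Mask1) {
  assert(Mask0.size() == Mask1.size() && "phi operands have the same type");
  std::optional<int> Offset;
  for (std::size_t I = 0; I != Mask0.size(); ++I) {
    int M0 = Mask0[I], M1 = Mask1[I];
    if (M0 == -1 && M1 == -1)
      continue; // Poison in both masks: this lane constrains nothing.
    if (M0 < 0 || M1 < 0)
      return std::nullopt; // Poison in only one mask: give up.
    if (Offset && *Offset != M0 - M1)
      return std::nullopt; // Every lane must agree on the same offset.
    Offset = M0 - M1;
  }
  return Offset; // std::nullopt if both masks were entirely poison.
}

When such an offset exists, the pass replaces the first incoming value with a cyclic rotation of the common input vector, keeps the raw input on the second edge, and applies Mask1 once after the narrowed phi, so only one result-width shuffle remains.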
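The scalarizeLoadExtract hunk above also narrows the AA metadata (e.g. TBAA) of each new scalar load to the element actually accessed, using a byte offset of index times the element store size; the alignment the scalar load can keep comes from computeAlignmentAfterScalarization, which is assumed here to follow the usual common-alignment rule. A standalone sketch of that arithmetic (plain C++; laneByteOffset and alignAfterScalarization are illustrative names):

#include <cstdint>

// Byte offset of lane LaneIndex within the loaded vector; the same product the
// hunk feeds to adjustForAccess.
uint64_t laneByteOffset(uint64_t LaneIndex, uint64_t ElemStoreSize) {
  return LaneIndex * ElemStoreSize;
}

// Largest power-of-two alignment guaranteed by both the original vector
// alignment and the lane's byte offset (VectorAlign must be a power of two).
uint64_t alignAfterScalarization(uint64_t VectorAlign, uint64_t ByteOffset) {
  uint64_t A = VectorAlign;
  while (A > 1 && ByteOffset % A != 0)
    A /= 2;
  return A;
}

For instance, a <4 x float> load aligned to 16 bytes: lane 3 sits at byte offset 12, so the scalar float load emitted for it is only known to be 4-byte aligned.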
