Diffstat (limited to 'llvm/lib/Target')
49 files changed, 1067 insertions, 384 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bad7ccd..3c06c6a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, if (IsCalleeWin64) { UseVarArgCC = true; } else { - UseVarArgCC = !Outs[i].IsFixed; + UseVarArgCC = ArgFlags.isVarArg(); } } @@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) + if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); } @@ -13482,7 +13482,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // Look for the first non-undef element. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); - // Benefit form APInt to handle overflow when calculating expected element. + // Benefit from APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false, @@ -13490,7 +13490,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // The following shuffle indices must be the successive elements after the // first real element. bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { - return Elt != ExpectedElt++ && Elt != -1; + return Elt != ExpectedElt++ && Elt >= 0; }); if (FoundWrongElt) return false; @@ -14742,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { return ResultSLI; } +static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64TargetLowering &TLI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + + if (!VT.isVector()) + return SDValue(); + + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // InstCombine does (not (neg a)) => (add a -1). + // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) + // Loop over all combinations of AND operands. + for (int i = 1; i >= 0; --i) { + for (int j = 1; j >= 0; --j) { + SDValue O0 = N0->getOperand(i); + SDValue O1 = N1->getOperand(j); + SDValue Sub, Add, SubSibling, AddSibling; + + // Find a SUB and an ADD operand, one from each AND. + if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { + Sub = O0; + Add = O1; + SubSibling = N0->getOperand(1 - i); + AddSibling = N1->getOperand(1 - j); + } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { + Add = O0; + Sub = O1; + AddSibling = N0->getOperand(1 - i); + SubSibling = N1->getOperand(1 - j); + } else + continue; + + if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) + continue; + + // Constant ones is always righthand operand of the Add. 
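(Illustrative note, not part of the patch.) The isEXTMask hunk above checks that, after the first defined index, the shuffle indices are consecutive modulo 2*NumElts, and with this change any negative (undef) entry is tolerated rather than only -1. A minimal standalone model of that check, with hypothetical names:

#include <cassert>
#include <cstddef>
#include <vector>

// True if every defined index in Mask follows its predecessor modulo
// 2 * NumElts (the two concatenated shuffle inputs). Negative entries are
// undef and never count as a mismatch.
static bool isConsecutiveModulo(const std::vector<int> &Mask,
                                unsigned NumElts) {
  const unsigned Mod = 2 * NumElts;
  std::size_t I = 0;
  while (I < Mask.size() && Mask[I] < 0)
    ++I; // skip leading undefs to find the first real element
  if (I == Mask.size())
    return false;
  unsigned Expected = static_cast<unsigned>(Mask[I]);
  for (; I < Mask.size(); ++I, Expected = (Expected + 1) % Mod)
    if (Mask[I] >= 0 && static_cast<unsigned>(Mask[I]) != Expected)
      return false;
  return true;
}

int main() {
  assert(isConsecutiveModulo({6, 7, -1, 9, 10, 11, 12, 13}, 8)); // EXT-like
  assert(!isConsecutiveModulo({6, 7, 9, 9, 10, 11, 12, 13}, 8));
  return 0;
}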
+ if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) + continue; + + if (Sub.getOperand(1) != Add.getOperand(0)) + continue; + + return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); + } + } + + // (or (and a b) (and (not a) c)) => (bsl a b c) + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getScalarSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + APInt Val1, Val2; + + if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && + ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && + (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), @@ -15777,6 +15877,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { isREVMask(M, EltSize, NumElts, 32) || isREVMask(M, EltSize, NumElts, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || + isSingletonEXTMask(M, VT, DummyUnsigned) || isTRNMask(M, NumElts, DummyUnsigned) || isUZPMask(M, NumElts, DummyUnsigned) || isZIPMask(M, NumElts, DummyUnsigned) || @@ -19418,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AArch64TargetLowering &TLI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); - - if (!VT.isVector()) - return SDValue(); - - if (VT.isScalableVector() && !Subtarget.hasSVE2()) - return SDValue(); - - if (VT.isFixedLengthVector() && - (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // InstCombine does (not (neg a)) => (add a -1). - // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) - // Loop over all combinations of AND operands. - for (int i = 1; i >= 0; --i) { - for (int j = 1; j >= 0; --j) { - SDValue O0 = N0->getOperand(i); - SDValue O1 = N1->getOperand(j); - SDValue Sub, Add, SubSibling, AddSibling; - - // Find a SUB and an ADD operand, one from each AND. 
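(Illustrative note, not part of the patch.) The BSP combine above is sound because bit-select computes (mask & b) | (~mask & c), and after InstCombine rewrites (not (neg a)) into (add a, -1) the value a - 1 is exactly ~(0 - a), so the two AND masks in the matched pattern are bitwise complements. A scalar check of that identity:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Scalar model of AArch64ISD::BSP / NEON BSL: take bits of B where Mask is
// set, bits of C where it is clear.
static uint32_t bsl(uint32_t Mask, uint32_t B, uint32_t C) {
  return (Mask & B) | (~Mask & C);
}

int main() {
  for (uint32_t A : {0u, 1u, 0x80000000u, 0xDEADBEEFu})
    for (uint32_t B : {0u, 0xFFFFFFFFu, 0x12345678u})
      for (uint32_t C : {0u, 0xFFFFFFFFu, 0x0F0F0F0Fu}) {
        uint32_t Neg = 0u - A;   // (sub 0, a)
        uint32_t AddM1 = A - 1u; // (add a, -1) == ~(sub 0, a)
        assert(((Neg & B) | (AddM1 & C)) == bsl(Neg, B, C));
      }
  return 0;
}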
- if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { - Sub = O0; - Add = O1; - SubSibling = N0->getOperand(1 - i); - AddSibling = N1->getOperand(1 - j); - } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { - Add = O0; - Sub = O1; - AddSibling = N0->getOperand(1 - i); - SubSibling = N1->getOperand(1 - j); - } else - continue; - - if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) - continue; - - // Constant ones is always righthand operand of the Add. - if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) - continue; - - if (Sub.getOperand(1) != Add.getOperand(0)) - continue; - - return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); - } - } - - // (or (and a b) (and (not a) c)) => (bsl a b c) - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getScalarSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - APInt Val1, Val2; - - if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && - ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && - (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - if (FoundMatch) - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to // convert to csel(ccmp(.., cc0)), depending on cc1: diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index b97d622..fd4ef2a 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri -// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri +// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri +// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ADDXri + ADDXri @@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { // Strategy used to split logical immediate bitmasks. enum class SplitStrategy { Intersect, + Disjoint, }; template <typename T> bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, @@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); + assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!"); // The bitmask immediate consists of consecutive ones. 
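(Illustrative note, not part of the patch.) The header comment above now covers EOR/ORR as well as AND; the new Disjoint strategy (splitDisjointBitmaskImm, added further down) peels the lowest run of contiguous ones off the constant so that the remainder ORs and XORs back to the original value, turning MOV + register-register logic into two immediate-form instructions. A minimal sketch of that split (C++20 <bit>), omitting the logical-immediate legality check done by the real pass:

#include <bit>
#include <cassert>
#include <cstdint>

// Split Imm into two disjoint masks whose OR (and XOR) equals Imm.
static void splitDisjoint(uint32_t Imm, uint32_t &Lo, uint32_t &Rest) {
  assert(Imm != 0 && Imm != ~0u && "rejected before splitting in the real pass");
  unsigned LowestBitSet = std::countr_zero(Imm);
  unsigned LowestGapBitUnset = LowestBitSet + std::countr_one(Imm >> LowestBitSet);
  assert(LowestGapBitUnset < 32 && "a single run is already a legal immediate");
  Lo = (1u << LowestGapBitUnset) - (1u << LowestBitSet); // lowest run of ones
  Rest = Imm & ~Lo;                                      // remaining bits
  assert((Lo | Rest) == Imm && (Lo ^ Rest) == Imm && (Lo & Rest) == 0);
}

int main() {
  uint32_t Lo, Rest;
  splitDisjoint(0x00600780u, Lo, Rest); // the example mask from the comment below
  assert(Lo == 0x00000780u && Rest == 0x00600000u);
  return 0;
}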
Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> +static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, + T &Imm2Enc) { + assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!"); + + // Try to split a bitmask of the form 0b00000000011000000000011110000000 into + // two disjoint masks such as 0b00000000011000000000000000000000 and + // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the + // new masks match the original mask. + unsigned LowestBitSet = llvm::countr_zero(Imm); + unsigned LowestGapBitUnset = + LowestBitSet + llvm::countr_one(Imm >> LowestBitSet); + + // Create a mask for the least significant group of consecutive ones. + assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!"); + T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) - + (static_cast<T>(1) << LowestBitSet); + // Create a disjoint mask for the remaining ones. + T NewImm2 = Imm & ~NewImm1; + + // Do not split if NewImm2 is not a valid bitmask immediate. + if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) + return false; + + Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); + Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); + return true; +} + +template <typename T> bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, SplitStrategy Strategy, unsigned OtherOpc) { - // Try below transformation. + // Try below transformations. // - // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri - // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri + // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri + // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two - // bitmask immediates. It makes only two AND instructions instead of multiple - // mov + and instructions. + // bitmask immediates based on the given split strategy. It makes only two + // logical instructions instead of multiple mov + logic instructions. return splitTwoPartImm<T>( MI, @@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, case SplitStrategy::Intersect: SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); break; + case SplitStrategy::Disjoint: + SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; } if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); @@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= trySplitLogicalImm<uint64_t>( AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; + case AArch64::EORWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::EORXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI, + SplitStrategy::Disjoint); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index adc984a..1bc1d98 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ @@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", @@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ @@ -756,7 +759,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -766,7 +768,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd, FeatureFPAC]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 010d0aaa..2155ace 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg(); bool Res; - if (Info.IsFixed && !UseVarArgsCCForFixed) { + if (!Flags.isVarArg() && !UseVarArgsCCForFixed) { if (!IsReturn) applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); @@ -361,7 +361,7 @@ struct OutgoingArgHandler : public 
CallLowering::OutgoingValueHandler { unsigned MaxSize = MemTy.getSizeInBytes() * 8; // For varargs, we always want to extend them to 8 bytes, in which case // we disable setting a max. - if (!Arg.IsFixed) + if (Arg.Flags[0].isVarArg()) MaxSize = 0; Register ValVReg = Arg.Regs[RegIndex]; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d84f512..f266398 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1013,6 +1013,14 @@ def FeatureAgentScopeFineGrainedRemoteMemoryAtomics "device memory." >; +def FeatureEmulatedSystemScopeAtomics + : SubtargetFeature<"emulated-system-scope-atomics", + "HasEmulatedSystemScopeAtomics", + "true", + "System scope atomics unsupported by the PCI-e are emulated in HW via CAS " + "loop and functional." +>; + def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", "HasDefaultComponentZero", "true", @@ -2062,6 +2070,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureAtomicFMinFMaxF64FlatInsts, FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureEmulatedSystemScopeAtomics, FeatureGloballyAddressableScratch, FeatureKernargPreload, FeatureVmemPrefInsts, @@ -2603,6 +2612,10 @@ def HasPkMinMax3Insts : Predicate<"Subtarget->hasPkMinMax3Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasSGetShaderCyclesInst : + Predicate<"Subtarget->hasSGetShaderCyclesInst()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a0c99b0..846a0b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { return true; if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - if (Intrinsic->getIntrinsicID() == Intrinsic::read_register) + Intrinsic::ID IID = Intrinsic->getIntrinsicID(); + switch (IID) { + case Intrinsic::read_register: return isReadRegisterSourceOfDivergence(Intrinsic); - - return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); + case Intrinsic::amdgcn_addrspacecast_nonnull: { + unsigned SrcAS = + Intrinsic->getOperand(0)->getType()->getPointerAddressSpace(); + unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace(); + return SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + DstAS == AMDGPUAS::FLAT_ADDRESS && + ST->hasGloballyAddressableScratch(); + } + default: + return AMDGPU::isIntrinsicSourceOfDivergence(IID); + } } // Assume all function calls are a source of divergence. @@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (isa<InvokeInst>(V)) return true; + // If the target supports globally addressable scratch, the mapping from + // scratch memory to the flat aperture changes therefore an address space cast + // is no longer uniform. 
+ if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) { + return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS && + ST->hasGloballyAddressableScratch(); + } + return false; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5530886..f47ddf5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -187,6 +187,7 @@ protected: bool HasFlatBufferGlobalAtomicFaddF64Inst = false; bool HasDefaultComponentZero = false; bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false; + bool HasEmulatedSystemScopeAtomics = false; bool HasDefaultComponentBroadcast = false; bool HasXF32Insts = false; /// The maximum number of instructions that may be placed within an S_CLAUSE, @@ -950,6 +951,12 @@ public: return HasAgentScopeFineGrainedRemoteMemoryAtomics; } + /// \return true is HW emulates system scope atomics unsupported by the PCI-e + /// via CAS loop. + bool hasEmulatedSystemScopeAtomics() const { + return HasEmulatedSystemScopeAtomics; + } + bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } bool hasDefaultComponentBroadcast() const { @@ -1081,7 +1088,7 @@ public: } bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } - bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } + bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } @@ -1555,12 +1562,16 @@ public: // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. bool hasPkMinMax3Insts() const { return GFX1250Insts; } + // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction. + bool hasSGetShaderCyclesInst() const { return GFX1250Insts; } + // \returns true if target has S_SETPRIO_INC_WG instruction. bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead - // of sign-extending. - bool hasGetPCZeroExtension() const { return GFX12Insts; } + // of sign-extending. Note that GFX1250 has not only fixed the bug but also + // extended VA to 57 bits. + bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } /// \returns SGPR allocation granularity supported by the subtarget. 
unsigned getSGPRAllocGranule() const { diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index deadb7a..2d0102f 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -536,6 +536,10 @@ enum Id { // HwRegCode, (6) [5:0] ID_SQ_PERF_SNAPSHOT_DATA1 = 22, ID_SQ_PERF_SNAPSHOT_PC_LO = 23, ID_SQ_PERF_SNAPSHOT_PC_HI = 24, + + // GFX1250 + ID_XNACK_STATE_PRIV = 33, + ID_XNACK_MASK_gfx1250 = 34, }; enum Offset : unsigned { // Offset, (5) [10:6] diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 63826b7..8f44c03 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17695,6 +17695,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && RMW->hasMetadata("amdgpu.no.remote.memory")) return true; + if (Subtarget.hasEmulatedSystemScopeAtomics()) + return true; } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) return true; @@ -17942,8 +17944,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::UMax: { if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // Always expand system scope min/max atomics. - if (HasSystemScope) + if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) return AtomicExpansionKind::CmpXChg; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3f61bbd..f20b22d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6122,10 +6122,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, !Op.isIdenticalTo(*MO)) return false; - // Do not fold a frame index into an instruction that already has a frame - // index. The frame index handling code doesn't handle fixing up operand - // constraints if there are multiple indexes. - if (Op.isFI() && MO->isFI()) + // Do not fold a non-inlineable and non-register operand into an + // instruction that already has a frame index. The frame index handling + // code could not handle well when a frame index co-exists with another + // non-register operand, unless that operand is an inlineable immediate. + if (Op.isFI()) return false; } } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && @@ -10073,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); unsigned opcode = MI.getOpcode(); + + auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg() + : MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + unsigned DstAS = DstTy.getAddressSpace(); + unsigned SrcAS = SrcTy.getAddressSpace(); + return SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + DstAS == AMDGPUAS::FLAT_ADDRESS && + ST.hasGloballyAddressableScratch() + ? InstructionUniformity::NeverUniform + : InstructionUniformity::Default; + }; + + // If the target supports globally addressable scratch, the mapping from + // scratch memory to the flat aperture changes therefore an address space cast + // is no longer uniform. 
+ if (opcode == TargetOpcode::G_ADDRSPACE_CAST) + return HandleAddrSpaceCast(MI); + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { auto IID = GI->getIntrinsicID(); if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) @@ -10082,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::AlwaysUniform; switch (IID) { + case Intrinsic::amdgcn_addrspacecast_nonnull: + return HandleAddrSpaceCast(MI); case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: // FIXME: Uniform if second result diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 8303410..431d73b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1653,6 +1653,12 @@ def S_SETPRIO_INC_WG : SOPP_Pseudo <"s_setprio_inc_wg", (ins i16imm:$simm16), "$ let SubtargetPredicate = HasSetPrioIncWgInst; } +def S_GET_SHADER_CYCLES_U64 : SOP1_64_0 <"s_get_shader_cycles_u64", + [(set i64:$sdst, (readcyclecounter))]> { + let SubtargetPredicate = HasSGetShaderCyclesInst; + let hasSideEffects = 1; +} + let Uses = [EXEC, M0] in { def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { @@ -2145,6 +2151,7 @@ defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; // GFX1250 +defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>; defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index e433b85..3d9455f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -223,6 +223,10 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, + // GFX1250 + {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, + // Aliases {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, }; diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 3955f2a..25ad9ec 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, default: { // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows // us to fold the constant into the cmp instruction. - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT); CC = ISD::SETGE; break; } @@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to // fold the constant into the cmp instruction. if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + // Doing a "icmp ugt i16 65535, %0" comparison should have been converted + // already to something else. Assert to make sure this assumption holds. 
+ assert((!C->isAllOnes()) && "integer overflow in comparison transform"); + RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT); CC = ISD::SETUGE; break; } diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp index 73abfe7..306db6a 100644 --- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp +++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp @@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { for (LoadInst *LI : LoadsToProcess) { Value *V = LI->getPointerOperand(); - auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()); + auto *GV = dyn_cast<GlobalVariable>(V); // If we didn't find the global, we may need to walk through a level of // indirection. This generally happens at -O0. - if (!GV) + if (!GV) { if (auto *NestedLI = dyn_cast<LoadInst>(V)) { BasicBlock::iterator BBI(NestedLI); Value *Loaded = FindAvailableLoadedValue( NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr); GV = dyn_cast_or_null<GlobalVariable>(Loaded); + } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) { + for (auto &Use : NestedAlloca->uses()) { + auto *Store = dyn_cast<StoreInst>(Use.getUser()); + if (!Store) + continue; + + Value *StoredVal = Store->getValueOperand(); + if (!StoredVal) + continue; + + // Try direct global match + GV = dyn_cast<GlobalVariable>(StoredVal); + if (GV) + break; + + // If it's a load, check its source + if (auto *Load = dyn_cast<LoadInst>(StoredVal)) { + GV = dyn_cast<GlobalVariable>(Load->getPointerOperand()); + if (GV) + break; + + // If loading from an unmodified stack copy of the global, reuse the + // global's value. Note: we are just repeating what we are doing for + // the load case for the alloca store pattern. + BasicBlock::iterator BBI(Load); + Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(), + BBI, 0, nullptr, nullptr); + GV = dyn_cast<GlobalVariable>(Loaded); + if (GV) + break; + } + } } + } auto It = HandleMap.find(GV); if (It == HandleMap.end()) { diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index ffd900c..5153d24 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_usum: + case Intrinsic::dx_imad: + case Intrinsic::dx_umad: return true; default: return false; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index a5bf0e5..6583a0f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, - Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits(); assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen"); MVT GRLenVT = GRLen == 32 ? 
MVT::i32 : MVT::i64; @@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, case LoongArchABI::ABI_LP64F: case LoongArchABI::ABI_ILP32D: case LoongArchABI::ABI_LP64D: - UseGPRForFloat = !IsFixed; + UseGPRForFloat = ArgFlags.isVarArg(); break; case LoongArchABI::ABI_ILP32S: case LoongArchABI::ABI_LP64S: @@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, // will not be passed by registers if the original type is larger than // 2*GRLen, so the register alignment rule does not apply. unsigned TwoGRLenInBytes = (2 * GRLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && + if (ArgFlags.isVarArg() && + ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); // Skip 'odd' register if necessary. @@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags, - CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) { + CCInfo, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(""); @@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags, - CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { + CCInfo, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(""); @@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full, - Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr)) + Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6b49a98f..f79ba74 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -330,7 +330,7 @@ private: unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index 9e8cd2e..13237c5 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -128,12 +128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); } -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, - const char *Func) { +void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); - CallOperandIsFixed.push_back(IsFixed); } /// Identify lowered values that originated from f128, float and sret to vXfXX @@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands( 
OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); - CallOperandIsFixed.push_back(Outs[i].IsFixed); } } diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 4229da5..30b68e8 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -36,7 +36,7 @@ public: static bool originalEVTTypeIsVectorFloat(EVT Ty); static bool originalTypeIsVectorFloat(const Type *Ty); - void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func); + void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); void PreAnalyzeReturnValue(EVT ArgVT); @@ -86,10 +86,6 @@ private: /// vector. SmallVector<bool, 4> OriginalRetWasFloatVector; - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed, - SmallVector<bool, 4> CallOperandIsFixed; - // Used to handle MIPS16-specific calling convention tweaks. // FIXME: This should probably be a fully fledged calling convention. SpecialCallingConvType SpecialCallingConv; @@ -106,7 +102,6 @@ public: OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); OriginalArgWasFloatVector.clear(); - CallOperandIsFixed.clear(); PreAnalyzeCallOperands(Outs, FuncArgs, Func); } @@ -213,7 +208,6 @@ public: bool WasOriginalRetVectorFloat(unsigned ValNo) const { return OriginalRetWasFloatVector[ValNo]; } - bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index 555773a..fa49108 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { if (IsReturn) State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); else - State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func); + State.PreAnalyzeCallOperand(Info.Ty, Func); return CallLowering::OutgoingValueAssigner::assignArg( ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 39e184a..0e5c16c 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A> class CCIfOrigArgWasF128<CCAction A> : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>; -/// Match if this specific argument is a vararg. -/// This is slightly different fro CCIfIsVarArg which matches if any argument is -/// a vararg. -class CCIfArgIsVarArg<CCAction A> - : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>; - /// Match if the return was a floating point vector. 
class CCIfOrigArgWasNotVectorFloat<CCAction A> : CCIf<"!static_cast<MipsCCState *>(&State)" @@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[ ]>; def CC_Mips : CallingConv<[ - CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>, + CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>, CCDelegateTo<CC_Mips_FixedArg> ]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 15f45a1..d4f0cc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); + // Vector reduction operations. These may be turned into shuffle or tree + // reductions depending on what instructions are available for each type. + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + MVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM}, + VT, Custom); + } + } + // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) MAKE_CASE(NVPTXISD::FCOPYSIGN) + MAKE_CASE(NVPTXISD::FMAXNUM3) + MAKE_CASE(NVPTXISD::FMINNUM3) + MAKE_CASE(NVPTXISD::FMAXIMUM3) + MAKE_CASE(NVPTXISD::FMINIMUM3) MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) MAKE_CASE(NVPTXISD::STACKRESTORE) MAKE_CASE(NVPTXISD::STACKSAVE) @@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); } +/// Reduces the elements using the scalar operations provided. The operations +/// are sorted descending in number of inputs they take. The flags on the +/// original reduction operation will be propagated to each scalar operation. +/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction +/// used in ExpandReductions and SelectionDAG. +static SDValue buildTreeReduction( + const SmallVector<SDValue> &Elements, EVT EltTy, + ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops, + const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) { + // Build the reduction tree at each level, starting with all the elements. + SmallVector<SDValue> Level = Elements; + + unsigned OpIdx = 0; + while (Level.size() > 1) { + // Try to reduce this level using the current operator. + const auto [Op, NumInputs] = Ops[OpIdx]; + + // Build the next level by partially reducing all elements. + SmallVector<SDValue> ReducedLevel; + unsigned I = 0, E = Level.size(); + for (; I + NumInputs <= E; I += NumInputs) { + // Reduce elements in groups of [NumInputs], as much as possible. + ReducedLevel.push_back(DAG.getNode( + Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags)); + } + + if (I < E) { + // Handle leftover elements. + + if (ReducedLevel.empty()) { + // We didn't reduce anything at this level. We need to pick a smaller + // operator. 
+ ++OpIdx; + assert(OpIdx < Ops.size() && "no smaller operators for reduction"); + continue; + } + + // We reduced some things but there's still more left, meaning the + // operator's number of inputs doesn't evenly divide this level size. Move + // these elements to the next level. + for (; I < E; ++I) + ReducedLevel.push_back(Level[I]); + } + + // Process the next level. + Level = ReducedLevel; + } + + return *Level.begin(); +} + +// Get scalar reduction opcode +static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return ISD::FMAXNUM; + case ISD::VECREDUCE_FMIN: + return ISD::FMINNUM; + case ISD::VECREDUCE_FMAXIMUM: + return ISD::FMAXIMUM; + case ISD::VECREDUCE_FMINIMUM: + return ISD::FMINIMUM; + default: + llvm_unreachable("unhandled reduction opcode"); + } +} + +/// Get 3-input scalar reduction opcode +static std::optional<NVPTXISD::NodeType> +getScalar3OpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return NVPTXISD::FMAXNUM3; + case ISD::VECREDUCE_FMIN: + return NVPTXISD::FMINNUM3; + case ISD::VECREDUCE_FMAXIMUM: + return NVPTXISD::FMAXIMUM3; + case ISD::VECREDUCE_FMINIMUM: + return NVPTXISD::FMINIMUM3; + default: + return std::nullopt; + } +} + +/// Lower reductions to either a sequence of operations or a tree if +/// reassociations are allowed. This method will use larger operations like +/// max3/min3 when the target supports them. +SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const SDNodeFlags Flags = Op->getFlags(); + SDValue Vector = Op.getOperand(0); + + const unsigned Opcode = Op->getOpcode(); + const EVT EltTy = Vector.getValueType().getVectorElementType(); + + // Whether we can use 3-input min/max when expanding the reduction. + const bool CanUseMinMax3 = + EltTy == MVT::f32 && STI.getSmVersion() >= 100 && + STI.getPTXVersion() >= 88 && + (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN || + Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM); + + // A list of SDNode opcodes with equivalent semantics, sorted descending by + // number of inputs they take. + SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps; + + if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode); + CanUseMinMax3 && Opcode3Elem) + ScalarOps.push_back({*Opcode3Elem, 3}); + ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2}); + + SmallVector<SDValue> Elements; + DAG.ExtractVectorElements(Vector, Elements); + + return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. 
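(Illustrative note, not part of the patch.) For fmax, buildTreeReduction and LowerVECREDUCE above produce a tree that greedily groups neighbouring elements with the widest available operator (the 3-input min/max on sm_100+/PTX 8.8), retries a level with the 2-input form only when the level is too small for the wider one, and carries leftovers up unreduced. The same shape on plain floats:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

static float max3(float A, float B, float C) { return std::max({A, B, C}); }

// Reduce Level to one value using operators sorted by arity, descending.
static float reduceTree(std::vector<float> Level) {
  const unsigned Arities[] = {3, 2}; // {fmax3, fmax}
  unsigned OpIdx = 0;
  while (Level.size() > 1) {
    const unsigned N = Arities[OpIdx];
    std::vector<float> Next;
    std::size_t I = 0;
    for (; I + N <= Level.size(); I += N) // full groups for this operator
      Next.push_back(N == 3 ? max3(Level[I], Level[I + 1], Level[I + 2])
                            : std::max(Level[I], Level[I + 1]));
    if (I < Level.size()) {
      if (Next.empty()) { // nothing reduced: retry the level with a smaller op
        ++OpIdx;
        assert(OpIdx < 2 && "no smaller operator left");
        continue;
      }
      for (; I < Level.size(); ++I) // leftovers move up a level unreduced
        Next.push_back(Level[I]);
    }
    Level = std::move(Next);
  }
  return Level.front();
}

int main() {
  assert(reduceTree({1, 5, 3, 2, 9, 4, 7}) == 9); // max3, max3, carry -> max3
  assert(reduceTree({2, 8}) == 8);                // falls back to 2-input max
  return 0;
}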
@@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: + return LowerVECREDUCE(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: @@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_prefetch_tensormap: { + auto &DL = I.getDataLayout(); + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = getPointerTy(DL); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; + Info.align.reset(); + return true; + } + case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index cf72a1e..43e721a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -64,6 +64,11 @@ enum NodeType : unsigned { UNPACK_VECTOR, FCOPYSIGN, + FMAXNUM3, + FMINNUM3, + FMAXIMUM3, + FMINIMUM3, + DYNAMIC_STACKALLOC, STACKRESTORE, STACKSAVE, @@ -286,6 +291,7 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index aac611d..1ab41bf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>; } +// Template for 3-input minimum/maximum instructions +// (sm_100+/PTX 8.8 and f32 only) +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32 functions. +multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> { + defvar nan_str = !if(NaN, ".NaN", ""); + def f32rrr : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, B32:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rri : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rii : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, f32imm:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; +} + // Template for instructions which take three FP args. The // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). 
// @@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>; defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>; +def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; + +defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>; +defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>; +defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>; +defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>; + defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; defm FABS_H: F2_Support_Half<"abs", fabs>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d337192..d4a0ca7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -39,6 +39,12 @@ def AS_match { code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; + code const = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + }]; + code param = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + }]; } @@ -950,33 +956,47 @@ foreach dim = 3...5 in { defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", [hasTMACTAGroupSupport]>; -//Prefetch and Prefetchu - -let Predicates = [hasPTX<80>, hasSM<90>] in { - class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>; +//Prefetchu and Prefetch - def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; - def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; - def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; - def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; - def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; - def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; +defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr); - def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; +multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> { + def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>; +} - def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>; - def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> { + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch" # addrspace_name # ".tensormap", + [(pattern_frag addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; } +defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>; +defm PREFETCH_GENERIC_TENSORMAP : 
PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>; +defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>; + +class PREFETCH_INTRS<string InstName, Intrinsic Intr> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(Intr addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>; +def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>; +def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>; +def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; + //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d..f4f8961 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + } } return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 9a6e261..b32d931b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -87,6 +87,13 @@ public: } unsigned getMinVectorRegisterBitWidth() const override { return 32; } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + // Turn off ExpandReductions pass for NVPTX, which doesn't have advanced + // swizzling operations. Our backend/Selection DAG can expand these + // reductions with less movs. + return false; + } + // We don't want to prevent inlining because of target-cpu and -features // attributes that were added to newer versions of LLVM/Clang: There are // no incompatible functions in PTX, ptxas will throw errors in such cases. 
diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h index b0e50b2..feab9c5 100644 --- a/llvm/lib/Target/PowerPC/PPCCCState.h +++ b/llvm/lib/Target/PowerPC/PPCCCState.h @@ -38,36 +38,6 @@ public: void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); } }; -class AIXCCState : public CCState { -private: - BitVector IsFixed; - -public: - AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C) - : CCState(CC, IsVarArg, MF, Locs, C) {} - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - // All formal arguments are fixed. - IsFixed.resize(Ins.size(), true); - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - IsFixed.resize(Outs.size(), false); - for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo) - if (Outs[ValNo].IsFixed) - IsFixed.set(ValNo); - - CCState::AnalyzeCallOperands(Outs, Fn); - } - - bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); } -}; - } // end namespace llvm #endif diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 47f0038..2698bd6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3925,9 +3925,6 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget.isAIXABI()) - report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX."); - return Op.getOperand(0); } @@ -3984,9 +3981,6 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget.isAIXABI()) - report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX."); - SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -3994,6 +3988,65 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Subtarget.isAIXABI()) { + // On AIX we create a trampoline descriptor by combining the + // entry point and TOC from the global descriptor (FPtr) with the + // nest argument as the environment pointer. + uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4; + MaybeAlign PointerAlign(PointerSize); + auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() + ? (MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant) + : MachineMemOperand::MONone; + + uint64_t TOCPointerOffset = 1 * PointerSize; + uint64_t EnvPointerOffset = 2 * PointerSize; + SDValue SDTOCPtrOffset = DAG.getConstant(TOCPointerOffset, dl, PtrVT); + SDValue SDEnvPtrOffset = DAG.getConstant(EnvPointerOffset, dl, PtrVT); + + const Value *TrampolineAddr = + cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + + SDValue OutChains[3]; + + // Copy the entry point address from the global descriptor to the + // trampoline buffer. 
+ SDValue LoadEntryPoint = + DAG.getLoad(PtrVT, dl, Chain, FPtr, MachinePointerInfo(Func, 0), + PointerAlign, MMOFlags); + SDValue EPLoadChain = LoadEntryPoint.getValue(1); + OutChains[0] = DAG.getStore(EPLoadChain, dl, LoadEntryPoint, Trmp, + MachinePointerInfo(TrampolineAddr, 0)); + + // Copy the TOC pointer from the global descriptor to the trampoline + // buffer. + SDValue TOCFromDescriptorPtr = + DAG.getNode(ISD::ADD, dl, PtrVT, FPtr, SDTOCPtrOffset); + SDValue TOCReg = DAG.getLoad(PtrVT, dl, Chain, TOCFromDescriptorPtr, + MachinePointerInfo(Func, TOCPointerOffset), + PointerAlign, MMOFlags); + SDValue TrampolineTOCPointer = + DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDTOCPtrOffset); + SDValue TOCLoadChain = TOCReg.getValue(1); + OutChains[1] = + DAG.getStore(TOCLoadChain, dl, TOCReg, TrampolineTOCPointer, + MachinePointerInfo(TrampolineAddr, TOCPointerOffset)); + + // Store the nest argument into the environment pointer in the trampoline + // buffer. + SDValue EnvPointer = DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDEnvPtrOffset); + OutChains[2] = + DAG.getStore(Chain, dl, Nest, EnvPointer, + MachinePointerInfo(TrampolineAddr, EnvPointerOffset)); + + SDValue TokenFactor = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + return TokenFactor; + } + bool isPPC64 = (PtrVT == MVT::i64); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); @@ -6036,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; - if (Outs[i].IsFixed) { + if (!ArgFlags.isVarArg()) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { @@ -6852,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) { static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &S) { - AIXCCState &State = static_cast<AIXCCState &>(S); + CCState &State) { const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( State.getMachineFunction().getSubtarget()); const bool IsPPC64 = Subtarget.isPPC64(); @@ -6865,9 +6917,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, if (ValVT == MVT::f128) report_fatal_error("f128 is unimplemented on AIX."); - if (ArgFlags.isNest()) - report_fatal_error("Nest arguments are unimplemented."); - static const MCPhysReg GPR_32[] = {// 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10}; @@ -6882,6 +6931,14 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32; + if (ArgFlags.isNest()) { + MCRegister EnvReg = State.AllocateReg(IsPPC64 ? PPC::X11 : PPC::R11); + if (!EnvReg) + report_fatal_error("More than one nest argument."); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, EnvReg, RegVT, LocInfo)); + return false; + } + if (ArgFlags.isByVal()) { const Align ByValAlign(ArgFlags.getNonZeroByValAlign()); if (ByValAlign > StackAlign) @@ -7032,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, // They are passed in VRs if any are available (unlike arguments passed // through ellipses) and shadow GPRs (unlike arguments to non-vaarg // functions) - if (State.isFixed(ValNo)) { + if (!ArgFlags.isVarArg()) { if (MCRegister VReg = State.AllocateReg(VR)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo)); // Shadow allocate GPRs and stack space even though we pass in a VR.
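For orientation, the three stores in the AIX INIT_TRAMPOLINE lowering above populate a buffer whose layout, in the 64-bit case, looks like the sketch below. The struct and its field names are purely illustrative assumptions, not part of the patch.

#include <cstdint>

// Illustrative view of the 64-bit AIX trampoline buffer filled in above: a
// function-descriptor-like triple of entry point, TOC pointer, and the nest
// argument reused as the environment pointer.
struct AIXTrampolineDescriptor64 {
  uint64_t EntryPoint; // offset 0:  copied from the callee's descriptor (FPtr)
  uint64_t TOCPointer; // offset 8:  copied from FPtr + PointerSize
  uint64_t EnvPointer; // offset 16: the Nest operand
};
static_assert(sizeof(AIXTrampolineDescriptor64) == 24,
              "three pointer-sized slots on 64-bit AIX");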
@@ -7220,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); const EVT PtrVT = getPointerTy(MF.getDataLayout()); // Reserve space for the linkage area on the stack. @@ -7567,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( MachineFunction &MF = DAG.getMachineFunction(); SmallVector<CCValAssign, 16> ArgLocs; - AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs, - *DAG.getContext()); + CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs, + *DAG.getContext()); // Reserve space for the linkage save area (LSA) on the stack. // In both PPC32 and PPC64 there are 6 reserved slots in the LSA: diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index d2b75a6..34026ed 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -45,8 +45,8 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, - IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; StackSize = State.getStackSize(); @@ -196,8 +196,8 @@ public: if (LocVT.isScalableVector()) MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall(); - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, - /*IsFixed=*/true, IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; StackSize = State.getStackSize(); @@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, - /*IsFixed=*/true, /*isRet=*/true, nullptr)) + /*isRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index cb6117e..70127e3 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, // Implements the RISC-V calling convention. Returns true upon failure. 
bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); @@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, break; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: - UseGPRForF16_F32 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); break; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: - UseGPRForF16_F32 = !IsFixed; - UseGPRForF64 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); + UseGPRForF64 = ArgFlags.isVarArg(); break; } @@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // currently if we are using ILP32E calling convention. This behavior may be // changed when RV32E/ILP32E is ratified. unsigned TwoXLenInBytes = (2 * XLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && + if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && ABI != RISCVABI::ABI_ILP32E) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); @@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // benchmark. But theoretically, it may have benefit for some cases. bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy) { + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet, + Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering(); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h index bf823b7..2030ce1 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.h +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -21,15 +21,15 @@ namespace llvm { typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 171940e..a7329d2 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1700,6 +1700,18 @@ def TuneNLogNVRGather def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; +def TuneDisableMISchedLoadClustering : SubtargetFeature<"disable-misched-load-clustering", + "EnableMISchedLoadClustering", "false", "Disable load clustering in the machine scheduler">; + +def 
TuneDisableMISchedStoreClustering : SubtargetFeature<"disable-misched-store-clustering", + "EnableMISchedStoreClustering", "false", "Disable store clustering in the machine scheduler">; + +def TuneDisablePostMISchedLoadClustering : SubtargetFeature<"disable-postmisched-load-clustering", + "EnablePostMISchedLoadClustering", "false", "Disable PostRA load clustering in the machine scheduler">; + +def TuneDisablePostMISchedStoreClustering : SubtargetFeature<"disable-postmisched-store-clustering", + "EnablePostMISchedStoreClustering", "false", "Disable PostRA store clustering in the machine scheduler">; + def TuneDisableLatencySchedHeuristic : SubtargetFeature<"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 03e54b3..e4aa8b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22282,8 +22282,8 @@ void RISCVTargetLowering::analyzeInputArgs( else if (In.isOrigArg()) ArgTy = FType->getParamType(In.getOrigArgIndex()); - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, IsRet, ArgTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -22300,8 +22300,8 @@ void RISCVTargetLowering::analyzeOutputArgs( ISD::ArgFlagsTy ArgFlags = Out.Flags; Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr; - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed, - IsRet, OrigTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -23083,7 +23083,7 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, /*IsRet=*/true, nullptr)) + /*IsRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 04ffb05..413ad8b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), @@ -642,11 +639,15 @@ let Predicates = [HasStdExtZbkb, IsRV32] in { def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. 
-def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)), +def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)), (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), (zexti16 (XLenVT GPR:$rs1))), (PACK (XLenVT GPR:$rs1), @@ -657,12 +658,40 @@ let Predicates = [HasStdExtZbkb, IsRV64] in { def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))), (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)), (zexti16 (i64 GPR:$rs1))), (PACKW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), (zexti16 (i64 GPR:$rs1)))), (PACKW GPR:$rs1, GPR:$rs2)>; + +// Match a pattern of 2 bytes being inserted into bits [31:16], with bits +// [15:0] coming from a zero extended value, and bits [63:32] being +// ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can +// also be a packh, it can be matched separately. +def : Pat<(binop_allwusers<or> + (or (shl GPR:$op1rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), + (zexti16 (XLenVT GPR:$rs1))), + (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; +// We need to manually reassociate the patterns because of the binop_allwusers. +def : Pat<(binop_allwusers<or> + (or (zexti16 (XLenVT GPR:$rs1)), + (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), + (shl GPR:$op1rs2, (XLenVT 24))), + (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; +def : Pat<(binop_allwusers<or> + (or (zexti16 (XLenVT GPR:$rs1)), + (shl GPR:$op1rs1, (XLenVT 24))), + (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), + (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td index 875a93d..39e099b 100644 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.td +++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td @@ -91,3 +91,59 @@ def TuneLDADDFusion CheckIsImmOperand<2>, CheckImmOperand<2, 0> ]>>; + +defvar Load = [LB, LH, LW, LD, LBU, LHU, LWU]; + +// Fuse add(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu): +// add(.uw) rd, rs1, rs2 +// load rd, imm12(rd) +def TuneADDLoadFusion + : SimpleFusion<"add-load-fusion", "HasADDLoadFusion", "Enable ADD(.UW) + load macrofusion", + CheckOpcode<[ADD, ADD_UW]>, + CheckOpcode<Load>>; + +// Fuse AUIPC followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// auipc rd, imm20 +// load rd, imm12(rd) +def TuneAUIPCLoadFusion + : SimpleFusion<"auipc-load-fusion", "HasAUIPCLoadFusion", + "Enable AUIPC + load macrofusion", + CheckOpcode<[AUIPC]>, + CheckOpcode<Load>>; + +// Fuse LUI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// lui rd, imm[31:12] +// load rd, imm12(rd) +def TuneLUILoadFusion + : SimpleFusion<"lui-load-fusion", "HasLUILoadFusion", + "Enable LUI + load macrofusion", + CheckOpcode<[LUI]>, + CheckOpcode<Load>>; + +// Bitfield extract fusion: similar to TuneShiftedZExtWFusion +// but without the immediate restriction +// slli rd, rs1, imm12 +// srli rd, rd, imm12 +def TuneBFExtFusion + : SimpleFusion<"bfext-fusion", "HasBFExtFusion", + "Enable SLLI+SRLI (bitfield extract) 
macrofusion", + CheckOpcode<[SLLI]>, + CheckOpcode<[SRLI]>>; + +// Fuse ADDI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// addi rd, rs1, imm12 +// load rd, imm12(rd) +def TuneADDILoadFusion + : SimpleFusion<"addi-load-fusion", "HasADDILoadFusion", + "Enable ADDI + load macrofusion", + CheckOpcode<[ADDI]>, + CheckOpcode<Load>>; + +// Fuse shXadd(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// shXadd(.uw) rd, rs1, rs2 +// load rd, imm12(rd) +def TuneSHXADDLoadFusion + : SimpleFusion<"shxadd-load-fusion", "HasSHXADDLoadFusion", + "Enable SH(1|2|3)ADD(.UW) + load macrofusion", + CheckOpcode<[SH1ADD, SH2ADD, SH3ADD, SH1ADD_UW, SH2ADD_UW, SH3ADD_UW]>, + CheckOpcode<Load>>; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 838edf6..31d2b3a 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -590,12 +590,17 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", FeatureStdExtZicboz, FeatureVendorXVentanaCondOps], [TuneVentanaVeyron, + TuneDisableMISchedLoadClustering, + TuneDisablePostMISchedLoadClustering, + TuneDisablePostMISchedStoreClustering, TuneLUIADDIFusion, TuneAUIPCADDIFusion, TuneZExtHFusion, TuneZExtWFusion, TuneShiftedZExtWFusion, - TuneLDADDFusion]> { + TuneADDLoadFusion, + TuneAUIPCLoadFusion, + TuneLUILoadFusion]> { let MVendorID = 0x61f; let MArchID = 0x8000000000010000; let MImpID = 0x111; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index bf23812..5541506 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,78 +13,113 @@ // //===----------------------------------------------------------------------===// -class SMX60IsWorstCaseMX<string mx, list<string> MxList> { - string LLMUL = LargestLMUL<MxList>.r; - bit c = !eq(mx, LLMUL); -} +//===----------------------------------------------------------------------===// +// Helpers + +// Maps LMUL string to corresponding value from the Values array +// LMUL values map to array indices as follows: +// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3], +// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6] +// Shorter lists are allowed, e.g., widening instructions don't work on M8 +class GetLMULValue<list<int> Values, string LMUL> { + defvar Index = !cond( + !eq(LMUL, "MF8"): 0, + !eq(LMUL, "MF4"): 1, + !eq(LMUL, "MF2"): 2, + !eq(LMUL, "M1"): 3, + !eq(LMUL, "M2"): 4, + !eq(LMUL, "M4"): 5, + !eq(LMUL, "M8"): 6, + ); -class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { - string LLMUL = LargestLMUL<MxList>.r; - int SSEW = SmallestSEW<mx, isF>.r; - bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); + assert !lt(Index, !size(Values)), + "Missing LMUL value for '" # LMUL # "'. " # + "Expected at least " # !add(Index, 1) # " elements, but got " # + !size(Values) # "."; + + int c = Values[Index]; } -defvar SMX60VLEN = 256; -defvar SMX60DLEN = !div(SMX60VLEN, 2); +// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL, +// then doubles Value for each subsequent LMUL +// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> returns: +// MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32 +// This is useful for modeling scheduling parameters that scale with LMUL. 
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> { + assert !le(BaseValue, Value), "BaseValue must be less-equal to Value"; + defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c; + defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c; -class Get1248Latency<string mx> { + // Calculate the difference in positions + defvar posDiff = !sub(currentPos, startPos); + + // Calculate Value * (2^posDiff) int c = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 4, - !eq(mx, "M8") : 8, - true: 1 + !eq(posDiff, 0) : Value, + !eq(posDiff, 1) : !mul(Value, 2), + !eq(posDiff, 2) : !mul(Value, 4), + !eq(posDiff, 3) : !mul(Value, 8), + !eq(posDiff, 4) : !mul(Value, 16), + !eq(posDiff, 5) : !mul(Value, 32), + !eq(posDiff, 6) : !mul(Value, 64), + true : BaseValue ); } -// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides -class Get4816Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 16, - true: 4 - ); +// Same as the previous function but BaseValue == Value +class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> { + int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c; +} + +// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32 +class ConstOneUntilMF4ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16 +class ConstOneUntilMF2ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8 +class ConstOneUntilM1ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c; } +//===----------------------------------------------------------------------===// +// Latency helper classes + // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max -class Get458Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 5, - !eq(mx, "M8") : 8, - true: 4 - ); +class Get4458Latency<string mx> { + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c; } -// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs -// Used for: widening operations +// Used for: widening operations (no M8) class Get4588Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 5, - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c; } // Used for: mask-producing comparisons, carry ops with mask, FP comparisons class Get461018Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 6, - !eq(mx, "M4") : 10, - !eq(mx, "M8") : 18, - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; } -// Used for: e64 multiply pattern, complex ops -class Get781632Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 8, - !eq(mx, "M4") : 16, - !eq(mx, "M8") : 32, - true: 7 - ); +//===----------------------------------------------------------------------===// + +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); } +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = 
!and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; } - let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VIALULat, ReleaseAtCycles = [4] in { // Pattern of vadd, vsub, vrsub: 4/4/5/8 // Pattern of vand, vor, vxor: 4/4/8/16 // They are grouped together, so we used the worst case 4/4/8/16 @@ -425,7 +461,7 @@ foreach mx = SchedMxList in { // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites - let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in { defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; @@ -461,15 +497,8 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - // Slightly reduced for fractional LMULs - defvar Multiplier = !cond( - !eq(mx, "MF8") : 12, - !eq(mx, "MF4") : 12, - !eq(mx, "MF2") : 12, - true: 24 - ); - - let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; + let Latency = VIDivLat, ReleaseAtCycles = [12] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; } @@ -480,14 +509,8 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - // Slightly increased for integer LMULs - defvar Multiplier = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 2, - true: 1 - ); - - let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + let Latency = VNarrowingLat, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 3f2a83f..66ce134 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -94,16 +94,6 @@ static cl::opt<bool> cl::desc("Enable the loop data prefetch pass"), cl::init(true)); -static cl::opt<bool> EnableMISchedLoadStoreClustering( - "riscv-misched-load-store-clustering", cl::Hidden, - cl::desc("Enable load and store clustering in the machine 
scheduler"), - cl::init(true)); - -static cl::opt<bool> EnablePostMISchedLoadStoreClustering( - "riscv-postmisched-load-store-clustering", cl::Hidden, - cl::desc("Enable PostRA load and store clustering in the machine scheduler"), - cl::init(true)); - static cl::opt<bool> DisableVectorMaskMutation( "riscv-disable-vector-mask-mutation", cl::desc("Disable the vector mask scheduling mutation"), cl::init(false), @@ -294,15 +284,17 @@ bool RISCVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, ScheduleDAGInstrs * RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const { + const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); ScheduleDAGMILive *DAG = createSchedLive(C); - if (EnableMISchedLoadStoreClustering) { + + if (ST.enableMISchedLoadClustering()) DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); + + if (ST.enableMISchedStoreClustering()) DAG->addMutation(createStoreClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); - } - const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); if (!DisableVectorMaskMutation && ST.hasVInstructions()) DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI)); @@ -311,13 +303,16 @@ RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const { ScheduleDAGInstrs * RISCVTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { + const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); ScheduleDAGMI *DAG = createSchedPostRA(C); - if (EnablePostMISchedLoadStoreClustering) { + + if (ST.enablePostMISchedLoadClustering()) DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); + + if (ST.enablePostMISchedStoreClustering()) DAG->addMutation(createStoreClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); - } return DAG; } diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index ba09451..6660de9 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_target(SPIRVCodeGen SPIRVGlobalRegistry.cpp SPIRVInstrInfo.cpp SPIRVInstructionSelector.cpp + SPIRVLegalizeImplicitBinding.cpp SPIRVStripConvergentIntrinsics.cpp SPIRVLegalizePointerCast.cpp SPIRVMergeRegionExitTargets.cpp diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index 1688fa3..1934e98 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -23,6 +23,7 @@ ModulePass *createSPIRVPrepareFunctionsPass(const SPIRVTargetMachine &TM); FunctionPass *createSPIRVStructurizerPass(); FunctionPass *createSPIRVMergeRegionExitTargetsPass(); FunctionPass *createSPIRVStripConvergenceIntrinsicsPass(); +ModulePass *createSPIRVLegalizeImplicitBindingPass(); FunctionPass *createSPIRVLegalizePointerCastPass(SPIRVTargetMachine *TM); FunctionPass *createSPIRVRegularizerPass(); FunctionPass *createSPIRVPreLegalizerCombiner(); @@ -49,6 +50,7 @@ void initializeSPIRVRegularizerPass(PassRegistry &); void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); void initializeSPIRVPrepareFunctionsPass(PassRegistry &); void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); +void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp new file mode 100644 index 
0000000..0398e52 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -0,0 +1,159 @@ +//===- SPIRVLegalizeImplicitBinding.cpp - Legalize implicit bindings ----*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass legalizes the @llvm.spv.resource.handlefromimplicitbinding +// intrinsic by replacing it with a call to +// @llvm.spv.resource.handlefrombinding. +// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include <algorithm> +#include <vector> + +using namespace llvm; + +namespace { +class SPIRVLegalizeImplicitBinding : public ModulePass { +public: + static char ID; + SPIRVLegalizeImplicitBinding() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + +private: + void collectBindingInfo(Module &M); + uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); + void replaceImplicitBindingCalls(Module &M); + + // A map from descriptor set to a bit vector of used binding numbers. + std::vector<BitVector> UsedBindings; + // A list of all implicit binding calls, to be sorted by order ID. + SmallVector<CallInst *, 16> ImplicitBindingCalls; +}; + +struct BindingInfoCollector : public InstVisitor<BindingInfoCollector> { + std::vector<BitVector> &UsedBindings; + SmallVector<CallInst *, 16> &ImplicitBindingCalls; + + BindingInfoCollector(std::vector<BitVector> &UsedBindings, + SmallVector<CallInst *, 16> &ImplicitBindingCalls) + : UsedBindings(UsedBindings), ImplicitBindingCalls(ImplicitBindingCalls) { + } + + void visitCallInst(CallInst &CI) { + if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefrombinding) { + const uint32_t DescSet = + cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); + const uint32_t Binding = + cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); + + if (UsedBindings.size() <= DescSet) { + UsedBindings.resize(DescSet + 1); + UsedBindings[DescSet].resize(64); + } + if (UsedBindings[DescSet].size() <= Binding) { + UsedBindings[DescSet].resize(2 * Binding + 1); + } + UsedBindings[DescSet].set(Binding); + } else if (CI.getIntrinsicID() == + Intrinsic::spv_resource_handlefromimplicitbinding) { + ImplicitBindingCalls.push_back(&CI); + } + } +}; + +void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { + BindingInfoCollector InfoCollector(UsedBindings, ImplicitBindingCalls); + InfoCollector.visit(M); + + // Sort the collected calls by their order ID. 
+ std::sort( + ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(), + [](const CallInst *A, const CallInst *B) { + const uint32_t OrderIdArgIdx = 0; + const uint32_t OrderA = + cast<ConstantInt>(A->getArgOperand(OrderIdArgIdx))->getZExtValue(); + const uint32_t OrderB = + cast<ConstantInt>(B->getArgOperand(OrderIdArgIdx))->getZExtValue(); + return OrderA < OrderB; + }); +} + +uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( + uint32_t DescSet) { + if (UsedBindings.size() <= DescSet) { + UsedBindings.resize(DescSet + 1); + UsedBindings[DescSet].resize(64); + } + + int NewBinding = UsedBindings[DescSet].find_first_unset(); + if (NewBinding == -1) { + NewBinding = UsedBindings[DescSet].size(); + UsedBindings[DescSet].resize(2 * NewBinding + 1); + } + + UsedBindings[DescSet].set(NewBinding); + return NewBinding; +} + +void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { + for (CallInst *OldCI : ImplicitBindingCalls) { + IRBuilder<> Builder(OldCI); + const uint32_t DescSet = + cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); + const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet); + + SmallVector<Value *, 8> Args; + Args.push_back(Builder.getInt32(DescSet)); + Args.push_back(Builder.getInt32(NewBinding)); + + // Copy the remaining arguments from the old call. + for (uint32_t i = 2; i < OldCI->arg_size(); ++i) { + Args.push_back(OldCI->getArgOperand(i)); + } + + Function *NewFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType()); + CallInst *NewCI = Builder.CreateCall(NewFunc, Args); + NewCI->setCallingConv(OldCI->getCallingConv()); + + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); + } +} + +bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) { + collectBindingInfo(M); + if (ImplicitBindingCalls.empty()) { + return false; + } + + replaceImplicitBindingCalls(M); + return true; +} +} // namespace + +char SPIRVLegalizeImplicitBinding::ID = 0; + +INITIALIZE_PASS(SPIRVLegalizeImplicitBinding, "legalize-spirv-implicit-binding", + "Legalize SPIR-V implicit bindings", false, false) + +ModulePass *llvm::createSPIRVLegalizeImplicitBindingPass() { + return new SPIRVLegalizeImplicitBinding(); +}
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index d7cf211..e0bfb77 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -226,6 +226,7 @@ void SPIRVPassConfig::addIRPasses() { } void SPIRVPassConfig::addISelPrepare() { + addPass(createSPIRVLegalizeImplicitBindingPass()); addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>())); if (TM.getSubtargetImpl()->isLogicalSPIRV()) addPass(createSPIRVLegalizePointerCastPass(&getTM<SPIRVTargetMachine>())); diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 1aa8efe..c0fc3a6 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs, if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128)) continue; // The fixed arguments to a varargs function still go in FP registers. - if (Outs[VA.getValNo()].IsFixed) + if (!Outs[VA.getValNo()].Flags.isVarArg()) continue; // This floating point argument should be reassigned. diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 25f4aac..fbb98ff 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -31,10 +31,6 @@ namespace SystemZ { class SystemZCCState : public CCState { private: - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed. - SmallVector<bool, 4> ArgIsFixed; - /// Records whether the value was widened from a short vector type. SmallVector<bool, 4> ArgIsShortVector; @@ -50,10 +46,6 @@ public: void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, CCAssignFn Fn) { - // Formal arguments are always fixed. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Ins.size(); ++i) - ArgIsFixed.push_back(true); // Record whether the call operand was a short vector. ArgIsShortVector.clear(); for (unsigned i = 0; i < Ins.size(); ++i) @@ -64,10 +56,6 @@ public: void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Outs.size(); ++i) - ArgIsFixed.push_back(Outs[i].IsFixed); // Record whether the call operand was a short vector. ArgIsShortVector.clear(); for (unsigned i = 0; i < Outs.size(); ++i) @@ -77,12 +65,11 @@ public: } // This version of AnalyzeCallOperands in the base class is not usable - // since we must provide a means of accessing ISD::OutputArg::IsFixed. + // since we must provide a means of accessing ISD::OutputArg::IsShortVector. void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, SmallVectorImpl<ISD::ArgFlagsTy> &Flags, CCAssignFn Fn) = delete; - bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } }; diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 0ad872b..059f31f 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A> "getSubtarget<SystemZSubtarget>().", F), A>; -// Match if this specific argument is a fixed (i.e. named) argument. 
-class CCIfFixed<CCAction A> - : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>; - -// Match if this specific argument is not a fixed (i.e. vararg) argument. -class CCIfNotFixed<CCAction A> - : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>; - // Match if this specific argument was widened from a short vector type. class CCIfShortVector<CCAction A> : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; @@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[ // Pass in STG registers: XMM1, ..., XMM6 CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, + CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, // Fail otherwise CCCustom<"CC_SystemZ_GHC_Error"> @@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[ // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, - V25, V27, V29, V31]>>>>, + CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30, + V25, V27, V29, V31]>>>>, // However, sub-128 vectors which need to go on the stack occupy just a // single 8-byte-aligned 8-byte stack slot. Pass as i64. @@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs. // Although we assign the f32 vararg to be bitcast, it will first be promoted // to an f64 within convertValVTToLocVT(). - CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>, + CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>, // Pointers are always passed in full 64-bit registers. CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>, // long double, can only be passed in GPR2 and GPR3, if available, // hence R2Q - CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, + CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, // Non fixed vector arguments are treated in the same way as long // doubles. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, + CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, // A SwiftSelf is passed in callee-saved R10. CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>, @@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, + CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, - V28, V29, V30, V31], 16, 8>>>>, + CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, + V28, V29, V30, V31], 16, 8>>>>, // The first 4 named float and double arguments are passed in registers // FPR0-FPR6. The rest will be passed in the user area. 
- CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, - CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, + CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f32], + CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, + CCIfType<[f64], + CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, // The first 2 long double arguments are passed in register FPR0/FPR2 // and FPR4/FPR6. The rest will be passed in the user area. - CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, + CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 6297916..5ee66e3 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -574,13 +574,11 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF, // Call mcount (Regmask from CC AnyReg since mcount preserves all normal // argument registers). - FunctionCallee FC = MF.getFunction().getParent()->getOrInsertFunction( - "mcount", Type::getVoidTy(MF.getFunction().getContext())); const uint32_t *Mask = MF.getSubtarget<SystemZSubtarget>() .getSpecialRegisters() ->getCallPreservedMask(MF, CallingConv::AnyReg); BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::CallBRASL)) - .addGlobalAddress(dyn_cast<Function>(FC.getCallee())) + .addExternalSymbol("mcount") .addRegMask(Mask); // Reload return address from 8 bytes above stack pointer. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 3f80b2a..f9eba4b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, OutVal = FINode; } // Count the number of fixed args *after* legalization. 
- NumFixedArgs += Out.IsFixed; + NumFixedArgs += !Out.Flags.isVarArg(); } bool IsVarArg = CLI.IsVarArg; @@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( for (const ISD::OutputArg &Out : Outs) { assert(!Out.Flags.isByVal() && "byval is not valid for return values"); assert(!Out.Flags.isNest() && "nest is not valid for return values"); - assert(Out.IsFixed && "non-fixed return value is not valid"); + assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); if (Out.Flags.isInConsecutiveRegs()) diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0a6035..d9f4405 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -75,7 +75,7 @@ public: static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7}; - if (!Info.IsFixed) + if (Flags.isVarArg()) NumXMMRegs = State.getFirstUnallocated(XMMArgRegs); return Res; @@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Info.CallConv, Info.IsVarArg)) return false; - bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + bool IsFixed = + Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg(); if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs |
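Across the PowerPC, RISC-V, Sparc, SystemZ, WebAssembly and X86 hunks in this diff, the separately threaded IsFixed information is replaced by querying the per-argument flags. The sketch below shows the resulting shape of a calling-convention assignment routine; assignExampleArg is hypothetical and target-neutral, and the stack placement is only an example, not any target's actual rule.

#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetCallingConv.h"

using namespace llvm;

// Hypothetical CCAssignFn-style routine: whether an argument is variadic now
// comes from ISD::ArgFlagsTy instead of an extra IsFixed parameter or
// Outs[i].IsFixed. Returns true on failure, matching the usual convention.
static bool assignExampleArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                             CCValAssign::LocInfo LocInfo,
                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
  if (ArgFlags.isVarArg()) {
    // Unnamed (variadic) argument: for illustration, give it an
    // 8-byte-aligned stack slot.
    int64_t Offset = State.AllocateStack(8, Align(8));
    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
    return false;
  }
  // Named arguments would fall through to the target's usual assignment rules.
  return true;
}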