Diffstat (limited to 'llvm/lib/Target')
22 files changed, 421 insertions, 346 deletions
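The first file in the diff below, AArch64ISelLowering.cpp, adds a CTPOP combine that rewrites ctpop(zext(bitcast(<N x i1> mask))) into neg(vecreduce_add(sext(mask))). As a minimal standalone sketch of why that identity holds (plain C++, not LLVM code; the 16-lane mask value is arbitrary, and GCC/Clang's __builtin_popcount stands in for ISD::CTPOP):

#include <cassert>
#include <cstdint>

int main() {
  // A <16 x i1> vector mask bitcast to i16: one bit per lane.
  uint16_t mask = 0b1010'0110'0001'1011;

  // VECREDUCE_ADD over the sign-extended lanes: sext(i1) is 0 or -1
  // (all ones), so the lane sum is exactly -popcount(mask).
  int32_t sum = 0;
  for (int lane = 0; lane < 16; ++lane)
    sum += ((mask >> lane) & 1) ? -1 : 0;

  // ctpop(zext(bitcast(mask))) == neg(signed reduce-add over the mask).
  assert(-sum == __builtin_popcount(mask));
  return 0;
}

Sign extension is used because, as the comment in performCTPOPCombine notes, it best fits ZeroOrNegativeOneBooleanContent; the final getNegative then recovers the count.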
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2987468..40e6400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -96,7 +97,6 @@ #include <cctype> #include <cstdint> #include <cstdlib> -#include <deque> #include <iterator> #include <limits> #include <optional> @@ -105,7 +105,6 @@ #include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -1175,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); + setTargetDAGCombine(ISD::CTPOP); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -11331,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, break; } + // Note: This lowering only overrides NEON for v1i64 and v2i64, where we + // prefer using SVE if available. if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); @@ -17555,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // udot instruction. if (SrcWidth * 4 <= DstWidth) { if (all_of(I->users(), [&](auto *U) { + using namespace llvm::PatternMatch; auto *SingleUser = cast<Instruction>(&*U); if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; @@ -17826,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // into shift / and masks. For the moment we do this just for uitofp (not // zext) to avoid issues with widening instructions. if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) { + using namespace llvm::PatternMatch; return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) && SI->getType()->getScalarSizeInBits() * 4 == SI->user_back()->getType()->getScalarSizeInBits(); @@ -17990,17 +17993,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, unsigned Factor, const APInt &GapMask) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - - if (isProfitableToInterleaveWithGatherScatter() && - Factor > getMaxSupportedInterleaveFactor()) - return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); - - assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && - "Invalid interleave factor"); - assert(!LaneMask && GapMask.popcount() == Factor && "Unexpected mask on store"); @@ -18146,126 +18143,6 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, return true; } -/// If the interleaved vector elements are greater than supported MaxFactor, -/// interleaving the data with additional shuffles can be used to -/// achieve the same. -/// -/// Consider the following data with 8 interleaves which are shuffled to store -/// stN instructions. 
Data needs to be stored in this order: -/// [v0, v1, v2, v3, v4, v5, v6, v7] -/// -/// v0 v4 v2 v6 v1 v5 v3 v7 -/// | | | | | | | | -/// \ / \ / \ / \ / -/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 -/// | | | | -/// \ / \ / -/// \ / \ / -/// \ / \ / -/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 -/// -/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored -/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with -/// another st4. -/// -/// For stN = 2, upper half of interleaved data V0, V1 is stored -/// with one st2 instruction. Second set V2, V3 is stored with another st2. -/// Total of 4 st2's are required here. -bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( - StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { - unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); - - auto *VecTy = cast<FixedVectorType>(SVI->getType()); - assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - - unsigned LaneLen = VecTy->getNumElements() / Factor; - Type *EltTy = VecTy->getElementType(); - auto *SubVecTy = FixedVectorType::get(EltTy, Factor); - - const DataLayout &DL = SI->getModule()->getDataLayout(); - bool UseScalable; - - // Skip if we do not have NEON and skip illegal vector types. We can - // "legalize" wide vector types into multiple interleaved accesses as long as - // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || - !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return false; - - if (UseScalable) - return false; - - std::deque<Value *> Shuffles; - Shuffles.push_back(SVI); - unsigned ConcatLevel = Factor; - // Getting all the interleaved operands. - while (ConcatLevel > 1) { - unsigned InterleavedOperands = Shuffles.size(); - for (unsigned i = 0; i < InterleavedOperands; i++) { - ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front()); - if (!SFL) - return false; - Shuffles.pop_front(); - - Value *Op0 = SFL->getOperand(0); - Value *Op1 = SFL->getOperand(1); - - Shuffles.push_back(dyn_cast<Value>(Op0)); - Shuffles.push_back(dyn_cast<Value>(Op1)); - } - ConcatLevel >>= 1; - } - - IRBuilder<> Builder(SI); - auto Mask = createInterleaveMask(LaneLen, 2); - SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen); - for (unsigned i = 0; i < LaneLen; i++) { - LowerHalfMask[i] = Mask[i]; - UpperHalfMask[i] = Mask[i + LaneLen]; - } - - unsigned InterleaveFactor = Factor >> 1; - while (InterleaveFactor >= MaxSupportedFactor) { - std::deque<Value *> ShufflesIntermediate; - ShufflesIntermediate.resize(Factor); - for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) { - for (unsigned i = 0; i < InterleaveFactor; i++) { - auto *Shuffle = Builder.CreateShuffleVector( - Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask); - ShufflesIntermediate[i + j] = Shuffle; - Shuffle = Builder.CreateShuffleVector( - Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask); - ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle; - } - } - Shuffles = ShufflesIntermediate; - InterleaveFactor >>= 1; - } - - Type *PtrTy = SI->getPointerOperandType(); - auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); - - Value *BaseAddr = SI->getPointerOperand(); - Function *StNFunc = getStructuredStoreFunction( - SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); - for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) { - SmallVector<Value *, 5> Ops; - for (unsigned j = 
0; j < MaxSupportedFactor; j++) - Ops.push_back(Shuffles[i * MaxSupportedFactor + j]); - - if (i > 0) { - // We will compute the pointer operand of each store from the original - // base address using GEPs. Cast the base address to a pointer to the - // scalar element type. - BaseAddr = Builder.CreateConstGEP1_32( - SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); - } - Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); - Builder.CreateCall(StNFunc, Ops); - } - return true; -} - bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); @@ -27968,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } +static SDValue performCTPOPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + using namespace llvm::SDPatternMatch; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask)) + SDValue Mask; + if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT MaskVT = Mask.getValueType(); + + if (VT.isVector() || !MaskVT.isFixedLengthVector() || + MaskVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ReduceInVT = + EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount()); + + SDLoc DL(N); + // Sign extend to best fit ZeroOrNegativeOneBooleanContent. + SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask); + SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask); + return DAG.getNegative(NegPopCount, DL, VT); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -28313,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScalarToVectorCombine(N, DCI, DAG); case ISD::SHL: return performSHLCombine(N, DCI, DAG); + case ISD::CTPOP: + return performCTPOPCombine(N, DCI, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index bfd8474..70bfae7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -229,10 +229,6 @@ public: bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override; - bool isProfitableToInterleaveWithGatherScatter() const override { - return true; - } - unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(Instruction *Load, Value *Mask, @@ -243,9 +239,6 @@ public: ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override; - bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI, - unsigned Factor) const; - bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8729ed3..197aae6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4922,36 +4922,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) return InstructionCost::getInvalid(); - 
unsigned NumLoadStores = 1; - InstructionCost ShuffleCost = 0; - bool isInterleaveWithShuffle = false; - unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor(); - - auto *SubVecTy = - VectorType::get(VecVTy->getElementType(), - VecVTy->getElementCount().divideCoefficientBy(Factor)); - - if (TLI->isProfitableToInterleaveWithGatherScatter() && - Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) && - Factor > MaxSupportedFactor) { - isInterleaveWithShuffle = true; - SmallVector<int, 16> Mask; - // preparing interleave Mask. - for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2; - i++) { - for (unsigned j = 0; j < 2; j++) - Mask.push_back(j * Factor + i); - } - - NumLoadStores = Factor / MaxSupportedFactor; - ShuffleCost = - (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy, - Mask, CostKind, 0, SubVecTy)); - } - - if (!UseMaskForGaps && - (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) { + if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); + auto *SubVecTy = + VectorType::get(VecVTy->getElementType(), + VecVTy->getElementCount().divideCoefficientBy(Factor)); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -4959,10 +4934,7 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( bool UseScalable; if (MinElts % Factor == 0 && TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return (Factor * - TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) * - NumLoadStores) + - ShuffleCost; + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index e3b0a1b..e62fdb6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -312,7 +312,7 @@ public: } bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const { - if (!ST->hasSVE()) + if (!ST->isSVEorStreamingSVEAvailable()) return false; // For fixed vectors, avoid scalarization if using SVE for them. diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 4fe194c..54d94b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2366,18 +2366,6 @@ def isGFX8GFX9NotGFX90A : " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; -// Pre-90A GFX9s allow the NV bit in FLAT instructions. -def isNVAllowedInFlat : - Predicate<"!Subtarget->hasGFX90AInsts() &&" - " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, - AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>; - -// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions. 
-def isNVNotAllowedInFlat : - Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||" - " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">, - AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>; - def isGFX90AOnly : Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">, AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2808c44..09338c5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1602,11 +1602,6 @@ public: bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); } - bool isFlatInstAndNVAllowed(const MCInst &Inst) const { - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A(); - } - AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast<AMDGPUTargetStreamer &>(TS); @@ -5375,7 +5370,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); Error(S, "scale_offset is not supported on this GPU"); } - if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) { + if (CPol & CPol::NV) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]); @@ -7150,13 +7145,6 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { unsigned Enabled = 0, Seen = 0; for (;;) { SMLoc S = getLoc(); - - if (isGFX9() && trySkipId("nv")) { - Enabled |= CPol::NV; - Seen |= CPol::NV; - continue; - } - bool Disabling; unsigned CPol = getCPolKind(getId(), Mnemo, Disabling); if (!CPol) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6ef2241..8ea64d1 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : bits<7> saddr; bits<10> vdst; - bits<6> cpol; + bits<5> cpol; // Only valid on gfx9 bits<1> lds = ps.lds; // LDS DMA for global and scratch @@ -2693,52 +2693,29 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> : !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands); } -class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> : - FLAT_Real_vi <op, ps, has_sccb> { - let AssemblerPredicate = isNVNotAllowedInFlat; -} - -class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> : - FLAT_Real_vi <op, ps, has_sccb> { - let AssemblerPredicate = isNVAllowedInFlat; - let Subtarget = SIEncodingFamily.GFX9; - let DecoderNamespace = "GFX9"; - let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit. 
-} - -multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> { - def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>; - def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>; -} - multiclass FLAT_Real_AllAddr_vi<bits<7> op, bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { - defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>; - defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; -} - -multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op, - bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { - def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>; - def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>; + def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; } class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> : FLAT_Real <op, ps>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> { let AssemblerPredicate = isGFX940Plus; - let DecoderNamespace = "GFX940"; + let DecoderNamespace = "GFX9"; let Inst{13} = ps.sve; let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue); } multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> { - let OtherPredicates = [isGFX8GFX9NotGFX940] in { - defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>; + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> { + let AssemblerPredicate = isGFX8GFX9NotGFX940; + let OtherPredicates = [isGFX8GFX9NotGFX940]; + } + def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> { + let DecoderNamespace = "GFX9"; } - - defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; - let AssemblerPredicate = isGFX940Plus in { def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>; def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>; @@ -2751,11 +2728,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op, bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { let OtherPredicates = [isGFX8GFX9NotGFX940] in { - let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in { - defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>; + def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> { + let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds"; } - let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in { - defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; + def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> { + let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds"; } } @@ -2771,66 +2748,47 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> { def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>; } -defm FLAT_LOAD_UBYTE_vi : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>; -defm FLAT_LOAD_SBYTE_vi : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>; -defm FLAT_LOAD_USHORT_vi : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>; -defm FLAT_LOAD_SSHORT_vi : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>; -defm FLAT_LOAD_DWORD_vi : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>; -defm FLAT_LOAD_DWORDX2_vi : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>; -defm FLAT_LOAD_DWORDX4_vi : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>; -defm FLAT_LOAD_DWORDX3_vi : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>; - -defm FLAT_STORE_BYTE_vi : FLAT_Real_mc_vi 
<0x18, FLAT_STORE_BYTE>; -defm FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>; -defm FLAT_STORE_SHORT_vi : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>; -defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>; -defm FLAT_STORE_DWORD_vi : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>; -defm FLAT_STORE_DWORDX2_vi : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>; -defm FLAT_STORE_DWORDX4_vi : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>; -defm FLAT_STORE_DWORDX3_vi : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>; - -defm FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>; -defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>; -defm FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>; -defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>; -defm FLAT_LOAD_SHORT_D16_vi : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>; -defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>; +def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; +def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; +def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; +def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>; +def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>; +def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>; +def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>; +def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>; + +def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>; +def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>; +def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>; +def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>; +def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>; +def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>; +def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>; +def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>; + +def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>; +def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>; +def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>; +def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>; +def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>; +def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>; multiclass FLAT_Real_Atomics_vi <bits<7> op, bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { defvar ps = !cast<FLAT_Pseudo>(NAME); - defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>; - defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>; - def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>; -} - -multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op, - bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { - defvar ps = !cast<FLAT_Pseudo>(NAME); - def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>; - def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>; - - def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>; + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>; + def _RTN_vi : FLAT_Real_vi<op, 
!cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>; + def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>; } multiclass FLAT_Global_Real_Atomics_vi<bits<7> op, bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> : FLAT_Real_AllAddr_vi<op, has_sccb> { - defm _RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>; - defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>; - - def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>; - def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>; -} - -multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op, - bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> : - FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> { - def _RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>; - def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>; + def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>; + def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>; - def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>; - def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>; + def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>; + def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>; } defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>; @@ -2992,10 +2950,10 @@ let AssemblerPredicate = isGFX940Plus in { defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>; defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>; defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>; - defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>; - defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>; - defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi_ex_gfx9<0x52>; - defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>; + defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>; } // End AssemblerPredicate = isGFX940Plus //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 3e6f35d..703ec0a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -186,12 +186,8 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, O << " dlc"; if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI)) O << (AMDGPU::isGFX940(STI) ? 
" sc1" : " scc"); - if (Imm & ~CPol::ALL_pregfx12) { - if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI)) - O << " nv"; - else - O << " /* unexpected cache policy bit */"; - } + if (Imm & ~CPol::ALL_pregfx12) + O << " /* unexpected cache policy bit */"; } void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index c89212d..90a4723 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return ArrayRef(TargetFlags); } +bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, + Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + enum MemIOffsetType { + Imm14Shift2, + Imm12, + Imm11Shift1, + Imm10Shift2, + Imm9Shift3, + Imm8, + Imm8Shift1, + Imm8Shift2, + Imm8Shift3 + }; + + MemIOffsetType OT; + switch (MemI.getOpcode()) { + default: + return false; + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + OT = Imm14Shift2; + break; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::FST_S: + case LoongArch::FST_D: + case LoongArch::VLD: + case LoongArch::VST: + case LoongArch::XVLD: + case LoongArch::XVST: + case LoongArch::VLDREPL_B: + case LoongArch::XVLDREPL_B: + OT = Imm12; + break; + case LoongArch::VLDREPL_H: + case LoongArch::XVLDREPL_H: + OT = Imm11Shift1; + break; + case LoongArch::VLDREPL_W: + case LoongArch::XVLDREPL_W: + OT = Imm10Shift2; + break; + case LoongArch::VLDREPL_D: + case LoongArch::XVLDREPL_D: + OT = Imm9Shift3; + break; + case LoongArch::VSTELM_B: + case LoongArch::XVSTELM_B: + OT = Imm8; + break; + case LoongArch::VSTELM_H: + case LoongArch::XVSTELM_H: + OT = Imm8Shift1; + break; + case LoongArch::VSTELM_W: + case LoongArch::XVSTELM_W: + OT = Imm8Shift2; + break; + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_D: + OT = Imm8Shift3; + break; + } + + if (MemI.getOperand(0).getReg() == Reg) + return false; + + if ((AddrI.getOpcode() != LoongArch::ADDI_W && + AddrI.getOpcode() != LoongArch::ADDI_D) || + !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm()) + return false; + + int64_t OldOffset = MemI.getOperand(2).getImm(); + int64_t Disp = AddrI.getOperand(2).getImm(); + int64_t NewOffset = OldOffset + Disp; + if (!STI.is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + + if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) && + !(OT == Imm12 && isInt<12>(NewOffset)) && + !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) && + !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) && + !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) && + !(OT == Imm8 && isInt<8>(NewOffset)) && + !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) && + !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) && + !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset))) + return false; + + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = 0; + AM.Scale = 0; + AM.Displacement = NewOffset; + AM.Form = ExtAddrMode::Formula::Basic; + return true; +} + +MachineInstr * 
+LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + const DebugLoc &DL = MemI.getDebugLoc(); + MachineBasicBlock &MBB = *MemI.getParent(); + + assert(AM.ScaledReg == 0 && AM.Scale == 0 && + "Addressing mode not supported for folding"); + + unsigned MemIOp = MemI.getOpcode(); + switch (MemIOp) { + default: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + case LoongArch::VSTELM_B: + case LoongArch::VSTELM_H: + case LoongArch::VSTELM_W: + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_B: + case LoongArch::XVSTELM_H: + case LoongArch::XVSTELM_W: + case LoongArch::XVSTELM_D: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .addImm(MemI.getOperand(3).getImm()) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + } +} + // Returns true if this is the sext.w pattern, addi.w rd, rs, 0. bool LoongArch::isSEXT_W(const MachineInstr &MI) { return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f25958a..f69a558 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -93,6 +93,12 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const override; + MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const override; + protected: const LoongArchSubtarget &STI; }; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 9de4c9d..92a9388 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -62,6 +62,11 @@ static cl::opt<bool> cl::desc("Enable the merge base offset pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableSinkFold("loongarch-enable-sink-fold", + cl::desc("Enable sinking and folding of instruction copies"), + cl::init(true), cl::Hidden); + static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -146,7 +151,9 @@ namespace { class LoongArchPassConfig : public TargetPassConfig { public: LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + setEnableSinkAndFold(EnableSinkFold); + } LoongArchTargetMachine &getLoongArchTargetMachine() const { return getTM<LoongArchTargetMachine>(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c3f100e..995ae75 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16496,32 +16496,42 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, } static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, - unsigned ShY) { + unsigned ShY, bool AddX) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, 
DAG.getTargetConstant(ShY, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getTargetConstant(ShX, DL, VT), Mul359); + DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359); } static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt) { + // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X)) switch (MulAmt) { case 5 * 3: - return getShlAddShlAdd(N, DAG, 2, 1); + return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false); case 9 * 3: - return getShlAddShlAdd(N, DAG, 3, 1); + return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false); case 5 * 5: - return getShlAddShlAdd(N, DAG, 2, 2); + return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false); case 9 * 5: - return getShlAddShlAdd(N, DAG, 3, 2); + return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false); case 9 * 9: - return getShlAddShlAdd(N, DAG, 3, 3); + return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false); default: - return SDValue(); + break; } + + // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) + int ShX; + if (int ShY = isShifted359(MulAmt - 1, ShX)) { + assert(ShX != 0 && "MulAmt=4,6,10 handled before"); + if (ShX <= 3) + return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true); + } + return SDValue(); } // Try to expand a scalar multiply to a faster sequence. @@ -16581,41 +16591,30 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getConstant(Shift, DL, VT)); } - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) - return V; + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + // (2/4/8 * 3/5/9 + 1) * 2^N + Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + if (Shift == 0) + return V; + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); + } // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's // easy. Then count how many zeros are up to the first bit. - if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); - } + if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(Shift, DL, VT), Shift1); } - // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) - // This is the two instruction form, there are also three instruction - // variants we could implement. e.g. 
- // (2^(1,2,3) * 3,5,9 + 1) << C2 - // 2^(C1>3) * 3,5,9 +/- 1 - if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { - assert(Shift != 0 && "MulAmt=4,6,10 handled before"); - if (Shift <= 3) { - SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getTargetConstant(ShXAmount, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getTargetConstant(Shift, DL, VT), X); - } - } + // TODO: 2^(C1>3) * 3,5,9 +/- 1 // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { @@ -16647,14 +16646,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - Shift = llvm::countr_zero(MulAmt); - if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { - SDLoc DL(N); - return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); - } } if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt)) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 636e31c..bf9de0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (!TII->isAddImmediate(*DeadMI, Reg)) continue; LIS->RemoveMachineInstrFromMaps(*DeadMI); + Register AddReg = DeadMI->getOperand(1).getReg(); DeadMI->eraseFromParent(); + if (AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } } @@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // Loop over the dead AVL values, and delete them now. This has // to be outside the above loop to avoid invalidating iterators. 
for (auto *MI : ToDelete) { + assert(MI->getOpcode() == RISCV::ADDI); + Register AddReg = MI->getOperand(1).getReg(); if (LIS) { LIS->removeInterval(MI->getOperand(0).getReg()); LIS->RemoveMachineInstrFromMaps(*MI); } MI->eraseFromParent(); + if (LIS && AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 56a38bb..b2cbdb2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call, return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); +} + static bool generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, @@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generatePipeInst(Call.get(), MIRBuilder, GR); case SPIRV::PredicatedLoadStore: return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR); + case SPIRV::BlockingPipes: + return generateBlockingPipesInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index c259cce..492a98e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def BlockingPipes : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. 
The information in the record @@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; + +//SPV_ALTERA_blocking_pipes +defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>; +defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>; defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 43b2869..f681b0d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -159,7 +159,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_KHR_maximal_reconvergence", SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, {"SPV_INTEL_kernel_attributes", - SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}, + {"SPV_ALTERA_blocking_pipes", + SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a61351e..03bd61b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr, "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">; def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops), "OpPredicatedStoreINTEL $ptr $object $predicate">; + +//SPV_ALTERA_blocking_pipes +def OpReadPipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; +def OpWritePipeBlockingALTERA :Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index e5ac76c4..af76016 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1885,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability( SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL); break; + case SPIRV::OpReadPipeBlockingALTERA: + case SPIRV::OpWritePipeBlockingALTERA: + if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) { + Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes); + Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA); + } + break; case SPIRV::OpCooperativeMatrixGetElementCoordINTEL: if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix)) report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the " diff --git
a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 4e4e6fb..be88f33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -56,6 +56,13 @@ public: } }; +static cl::list<std::string> SPVAllowUnknownIntrinsics( + "spv-allow-unknown-intrinsics", cl::CommaSeparated, + cl::desc("Emit unknown intrinsics as calls to external functions. A " + "comma-separated input list of intrinsic prefixes must be " + "provided, and only intrinsics carrying a listed prefix get " + "emitted as described."), + cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional); } // namespace char SPIRVPrepareFunctions::ID = 0; @@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { EraseFromParent); Changed = true; break; + default: + if (TM.getTargetTriple().getVendor() == Triple::AMD || + any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) { + if (Prefix.empty()) + return false; + return II->getCalledFunction()->getName().starts_with(Prefix); + })) + Changed |= lowerIntrinsicToFunction(II); + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 1b4b29b..65a8885 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; @@ -611,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>; defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index af32298..fc6c290 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa // into conversion ops setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::FP_ROUND, ISD::CONCAT_VECTORS}); + ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND, + ISD::CONCAT_VECTORS}); setTargetDAGCombine(ISD::TRUNCATE); @@ -3580,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N, } } +SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems, + SelectionDAG &DAG) { + SDLoc 
DL(In); + LLVMContext &Ctx = *DAG.getContext(); + EVT InVT = In.getValueType(); + unsigned NumElems = InVT.getVectorNumElements() * 2; + EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems); + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT)); + if (NumElems < RequiredNumElems) { + return DoubleVectorWidth(Concat, RequiredNumElems, DAG); + } + return Concat; +} + +SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutElTy = OutVT.getVectorElementType(); + if (OutElTy != MVT::i8 && OutElTy != MVT::i16) + return SDValue(); + + unsigned NumElems = OutVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + + EVT FPVT = N->getOperand(0)->getValueType(0); + if (FPVT.getVectorElementType() != MVT::f32) + return SDValue(); + + SDLoc DL(N); + + // First, convert to i32. + LLVMContext &Ctx = *DAG.getContext(); + EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems); + SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0)); + APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + // Mask out the top MSBs. + SDValue Masked = + DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT)); + + if (OutVT.getSizeInBits() < 128) { + // Create a wide enough vector that we can use narrow. + EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16; + unsigned NumRequiredElems = NarrowedVT.getVectorNumElements(); + SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG); + SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG); + return DAG.getBitcast( + OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits())); + } else { + return truncateVectorWithNARROW(OutVT, Masked, DL, DAG); + } + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3606,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performConvertFPCombine(N, DCI.DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4d44227b3..168e041 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53442,7 +53442,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, } SDValue NewStore = - DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), Align(), St->getMemOperand()->getFlags()); // If there are other uses of StoredVal, replace with a new load of the @@ -54639,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad;
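On the WebAssembly side, performConvertFPCombine above handles fp_to_sint/fp_to_uint producing i8/i16 lanes by converting to i32 lanes first, ANDing with a low-bits mask, and then packing, reusing the same mask-then-narrow trick as the existing TRUNCATE combine (the mask keeps every lane inside the unsigned output range, so the saturating narrow packs exactly). A per-lane scalar model of that masking step, plain C++ rather than LLVM code, with arbitrary sample values:

#include <cassert>
#include <cstdint>

int main() {
  const float lanes[] = {0.0f, 1.5f, -2.75f, 100.25f, 127.9f, -128.0f};
  for (float f : lanes) {
    int32_t wide = static_cast<int32_t>(f);                 // fp_to_sint to an i32 lane
    uint32_t masked = static_cast<uint32_t>(wide) & 0xFFu;  // "mask out the top MSBs"
    uint8_t narrowed = static_cast<uint8_t>(wide);          // direct i32 -> i8 truncation
    // Mask-then-pack keeps exactly the low 8 bits of the i32 conversion,
    // i.e. the same bits a plain lane truncation would keep.
    assert(narrowed == masked);
  }
  return 0;
}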

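Further up, the RISCVISelLowering.cpp hunk reworks expandMul so the shXadd-pair expansions also cover 3/5/9 * 3/5/9 * 2^N and (2/4/8 * 3/5/9 + 1) * 2^N: the odd part is expanded first and a single SHL is appended for the trailing zeros. A standalone arithmetic check of three such decompositions, in plain C++ with a hypothetical shxadd helper mirroring Zba's sh1add/sh2add/sh3add semantics:

#include <cassert>
#include <cstdint>

// Zba shNadd semantics: (x << sh) + y.
static uint64_t shxadd(uint64_t x, unsigned sh, uint64_t y) {
  return (x << sh) + y;
}

int main() {
  const uint64_t x = 0x1234'5678'9ABCULL;

  // 25*x = 5*5*x: Mul359 = sh2add(x, x) = 5x, then sh2add(Mul359, Mul359).
  uint64_t m5 = shxadd(x, 2, x);
  assert(shxadd(m5, 2, m5) == 25 * x);

  // 11*x = (2*5 + 1)*x: the new AddX form, sh1add(sh2add(x, x), x).
  assert(shxadd(shxadd(x, 2, x), 1, x) == 11 * x);

  // 50*x = (5*5*x) << 1: expand the odd part 25, then one final shift.
  assert((shxadd(m5, 2, m5) << 1) == 50 * x);
  return 0;
}

Each of these lowers to three cheap ALU ops when Zba is available, which is why the combine now strips trailing zeros with countr_zero before trying the shXadd patterns.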