Diffstat (limited to 'llvm/lib')
84 files changed, 3042 insertions, 1475 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 428342f..dd9a44b 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3670,14 +3670,12 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase); const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase); - if (Src != Dst) { - // Check that memory access offsets are multiples of element sizes. - if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || - !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { - LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); - return std::make_unique<Dependence>(Src, Dst, - SCEVUnionPredicate(Assume, *SE)); - } + // Check that memory access offsets are multiples of element sizes. + if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || + !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { + LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); } if (!Assume.empty()) { diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 898bf5b..95f30fd 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)), Valid(true) {} bool Vocabulary::isValid() const { - return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid; + return Vocab.size() == Vocabulary::expectedSize() && Valid; } size_t Vocabulary::size() const { @@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +unsigned Vocabulary::getNumericID(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Opcode - 1; // Convert to zero-based index +} + +unsigned Vocabulary::getNumericID(Type::TypeID TypeID) { + assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID"); + return MaxOpcodes + static_cast<unsigned>(TypeID); +} + +unsigned Vocabulary::getNumericID(const Value *Op) { + unsigned Index = static_cast<unsigned>(getOperandKind(Op)); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return MaxOpcodes + MaxTypeIDs + Index; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { - assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds && + assert(Pos < Vocabulary::expectedSize() && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 7f0ed0b..1b3da59 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -306,6 +306,15 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) { } } +VectorType *llvm::getDeinterleavedVectorType(IntrinsicInst *DI) { + [[maybe_unused]] unsigned Factor = + getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); + ArrayRef<Type *> DISubtypes = DI->getType()->subtypes(); + assert(Factor && Factor == DISubtypes.size() && + "unexpected deinterleave factor or result type"); + return cast<VectorType>(DISubtypes[0]); +} + /// Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3922eba..e8f513a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI, return false; } -MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { +MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); auto &UDivorRem = cast<GenericMachineInstr>(MI); @@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { return ret; } -bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { +bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); Register Dst = MI.getOperand(0).getReg(); @@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const { - auto *NewMI = buildUDivorURemUsingMul(MI); +void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const { + auto *NewMI = buildUDivOrURemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); +bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM); Register Dst = MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); @@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { return false; // If the sdiv has an 'exact' flag we can use a simpler lowering. 
- if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + if (Opcode == TargetOpcode::G_SDIV && + MI.getFlag(MachineInstr::MIFlag::IsExact)) { return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } @@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) && !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}})) return false; + if (Opcode == TargetOpcode::G_SREM && + !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}})) + return false; } return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applySDivByConst(MachineInstr &MI) const { - auto *NewMI = buildSDivUsingMul(MI); +void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const { + auto *NewMI = buildSDivOrSRemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); - auto &SDiv = cast<GenericMachineInstr>(MI); - Register Dst = SDiv.getReg(0); - Register LHS = SDiv.getReg(1); - Register RHS = SDiv.getReg(2); +MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(MI.getOpcode() == TargetOpcode::G_SDIV || + Opcode == TargetOpcode::G_SREM); + auto &SDivorRem = cast<GenericMachineInstr>(MI); + Register Dst = SDivorRem.getReg(0); + Register LHS = SDivorRem.getReg(1); + Register RHS = SDivorRem.getReg(2); LLT Ty = MRI.getType(Dst); LLT ScalarTy = Ty.getScalarType(); const unsigned EltBits = ScalarTy.getScalarSizeInBits(); @@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); auto T = MIB.buildLShr(Ty, Q, SignShift); T = MIB.buildAnd(Ty, T, ShiftMask); - return MIB.buildAdd(Ty, Q, T); + auto ret = MIB.buildAdd(Ty, Q, T); + + if (Opcode == TargetOpcode::G_SREM) { + auto Prod = MIB.buildMul(Ty, ret, RHS); + return MIB.buildSub(Ty, LHS, Prod); + } + return ret; } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9559983..d2b2edf 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); + Value *Mask = nullptr; if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) { - Value *LaneMask = - getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy)); - if (!LaneMask) + Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy)); + if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); - - // Sometimes the number of Shuffles might be less than Factor, we have to - // fill the gaps with null. Also, lowerInterleavedVPLoad - // expects them to be sorted. - SmallVector<Value *, 4> ShuffleValues(Factor, nullptr); - for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) - ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
- return !Extracts.empty() || BinOpShuffleChanged; } else { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); - - // Try to create target specific intrinsics to replace the load and - // shuffles. - if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices, - Factor)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; } + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles, + Indices, Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + DeadInsts.insert_range(Shuffles); DeadInsts.insert(Load); @@ -618,29 +607,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); assert(Factor && "unexpected deinterleave intrinsic"); - SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr); - Value *LastFactor = nullptr; - for (auto *User : DI->users()) { - auto *Extract = dyn_cast<ExtractValueInst>(User); - if (!Extract || Extract->getNumIndices() != 1) - return false; - unsigned Idx = Extract->getIndices()[0]; - if (DeinterleaveValues[Idx]) - return false; - DeinterleaveValues[Idx] = Extract; - LastFactor = Extract; - } - - if (!LastFactor) - return false; - Value *Mask = nullptr; if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Mask = getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType())); + Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; @@ -657,12 +630,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Try and match this with target specific intrinsics. if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast<Instruction>(LoadedVal), Mask, - DeinterleaveValues)) + DI)) return false; - for (Value *V : DeinterleaveValues) - if (V) - DeadInsts.insert(cast<Instruction>(V)); DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. DeadInsts.insert(cast<Instruction>(LoadedVal)); @@ -681,23 +651,19 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID()); assert(Factor && "unexpected interleave intrinsic"); + Value *Mask = nullptr; if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor, - cast<VectorType>(InterleaveValues[0]->getType())); + Mask = getMask(WideMask, Factor, + cast<VectorType>(InterleaveValues[0]->getType())); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Since lowerInterleavedStore expects Shuffle and StoreInst, use special - // TLI function to emit target-specific interleaved instruction. 
- if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) - return false; } else { auto *SI = cast<StoreInst>(StoredBy); if (!SI->isSimple()) @@ -705,12 +671,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; } + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(cast<Instruction>(StoredBy), Mask, + InterleaveValues)) + return false; + // We now have a target-specific store, so delete the old one. DeadInsts.insert(cast<Instruction>(StoredBy)); DeadInsts.insert(II); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0e8e4c9..40464e9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -609,6 +609,8 @@ namespace { SDValue foldABSToABD(SDNode *N, const SDLoc &DL); SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const SDLoc &DL); + SDValue foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, const SDLoc &DL); SDValue unfoldMaskedMerge(SDNode *N); SDValue unfoldExtremeBitClearingToShifts(SDNode *N); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, @@ -859,7 +861,7 @@ namespace { auto LK = TLI.getTypeConversion(*DAG.getContext(), VT); return (LK.first == TargetLoweringBase::TypeLegal || LK.first == TargetLoweringBase::TypePromoteInteger) && - TLI.isOperationLegal(ISD::UMIN, LK.second); + TLI.isOperationLegalOrCustom(ISD::UMIN, LK.second); } public: @@ -4093,6 +4095,26 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } + // (sub x, ([v]select (ult x, y), 0, y)) -> (umin x, (sub x, y)) + // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y)) + if (N1.hasOneUse() && hasUMin(VT)) { + SDValue Y; + if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero())) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero()))) + return DAG.getNode(ISD::UMIN, DL, VT, N0, + DAG.getNode(ISD::SUB, DL, VT, N0, Y)); + } + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -4442,20 +4464,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { sd_match(N1, m_UMaxLike(m_Specific(A), m_Specific(B)))) return DAG.getNegative(DAG.getNode(ISD::ABDU, DL, VT, A, B), DL, VT); - // (sub x, (select (ult x, y), 0, y)) -> (umin x, (sub x, y)) - // (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y)) - if (hasUMin(VT)) { - SDValue Y; - if (sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y)))) || - sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero())))) - return DAG.getNode(ISD::UMIN, DL, VT, N0, - DAG.getNode(ISD::SUB, DL, VT, N0, Y)); - } - return SDValue(); } @@ -12173,6 +12181,30 @@ SDValue 
DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, return SDValue(); } +// ([v]select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) +// ([v]select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) +SDValue DAGCombiner::foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, + const SDLoc &DL) { + APInt C; + EVT VT = True.getValueType(); + if (sd_match(RHS, m_ConstInt(C)) && hasUMin(VT)) { + if (CC == ISD::SETUGT && LHS == False && + sd_match(True, m_Add(m_Specific(False), m_SpecificInt(~C)))) { + SDValue AddC = DAG.getConstant(~C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, False, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, Add, False); + } + if (CC == ISD::SETULT && LHS == True && + sd_match(False, m_Add(m_Specific(True), m_SpecificInt(-C)))) { + SDValue AddC = DAG.getConstant(-C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, True, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, True, Add); + } + } + return SDValue(); +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12358,24 +12390,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // (select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) // (select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) - APInt C; - if (sd_match(Cond1, m_ConstInt(C)) && hasUMin(VT)) { - if (CC == ISD::SETUGT && Cond0 == N2 && - sd_match(N1, m_Add(m_Specific(N2), m_SpecificInt(~C)))) { - // The resulting code relies on an unsigned wrap in ADD. - // Recreating ADD to drop possible nuw/nsw flags. - SDValue AddC = DAG.getConstant(~C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N2, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, Add, N2); - } - if (CC == ISD::SETULT && Cond0 == N1 && - sd_match(N2, m_Add(m_Specific(N1), m_SpecificInt(-C)))) { - // Ditto. - SDValue AddC = DAG.getConstant(-C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, N1, Add); - } - } + if (SDValue UMin = foldSelectToUMin(Cond0, Cond1, N1, N2, CC, DL)) + return UMin; } if (!VT.isVector()) @@ -13412,6 +13428,11 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { } } } + + // (vselect (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) + // (vselect (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) + if (SDValue UMin = foldSelectToUMin(LHS, RHS, N1, N2, CC, DL)) + return UMin; } if (SimplifySelectOps(N, N1, N2)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 70a39ea..682d93d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13872,6 +13872,8 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return; } + const SDNode *EntrySDN = getEntryNode().getNode(); + // We need to copy NodeExtraInfo to all _new_ nodes that are being introduced // through the replacement of From with To. Otherwise, replacements of a node // (From) with more complex nodes (To and its operands) may result in lost @@ -13903,9 +13905,14 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return true; if (!Visited.insert(N).second) return true; - if (getEntryNode().getNode() == N) + if (EntrySDN == N) return false; for (const SDValue &Op : N->op_values()) { + if (N == To && Op.getNode() == EntrySDN) { + // Special case: New node's operand is the entry node; just need to + // copy extra info to new node. 
+ break; + } if (!Self(Self, Op.getNode())) return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 74c14ed..01e5312 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -845,16 +846,13 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, static void failForInvalidBundles(const CallBase &I, StringRef Name, ArrayRef<uint32_t> AllowedBundles) { if (I.hasOperandBundlesOtherThan(AllowedBundles)) { + ListSeparator LS; std::string Error; + raw_string_ostream OS(Error); for (unsigned i = 0, e = I.getNumOperandBundles(); i != e; ++i) { OperandBundleUse U = I.getOperandBundleAt(i); - bool First = true; - if (is_contained(AllowedBundles, U.getTagID())) - continue; - if (!First) - Error += ", "; - First = false; - Error += U.getTagName(); + if (!is_contained(AllowedBundles, U.getTagID())) + OS << LS << U.getTagName(); } reportFatalUsageError( Twine("cannot lower ", Name) diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 9cc9af8..b79911b 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -731,8 +731,8 @@ BasicBlock *CreateFailBB(Function *F, const TargetLowering &TLI) { } if (StackChkFail) { - cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn); - B.CreateCall(StackChkFail, Args); + CallInst *Call = B.CreateCall(StackChkFail, Args); + Call->addFnAttr(Attribute::NoReturn); } B.CreateUnreachable(); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 518a933..18d6bbc 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -792,12 +792,18 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, const MachineOperand &MO = MI.getOperand(1 - Ops[0]); MachineBasicBlock::iterator Pos = MI; - - if (Flags == MachineMemOperand::MOStore) - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, - Register()); - else + if (Flags == MachineMemOperand::MOStore) { + if (MO.isUndef()) { + // If this is an undef copy, we do not need to bother we inserting spill + // code. + BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); + } else { + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + Register()); + } + } else loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + return &*--Pos; } diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 222dc88..559d808 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -43,6 +43,12 @@ namespace llvm { using namespace dwarf_linker; using namespace dwarf_linker::classic; +enum InvalidStmtSeqOffset { + MaxStmtSeqOffset = UINT64_MAX, + OrigOffsetMissing = MaxStmtSeqOffset - 1, + NewOffsetMissing = MaxStmtSeqOffset - 2, +}; + /// Hold the input and output of the debug info size in bytes. 
struct DebugInfoSize { uint64_t Input; @@ -2315,7 +2321,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Some sequences are discarded by the DWARFLinker if they are invalid // (empty). if (OrigRowIter == SeqOffToOrigRow.end()) { - StmtSeq.set(UINT64_MAX); + StmtSeq.set(OrigOffsetMissing); continue; } size_t OrigRowIndex = OrigRowIter->second; @@ -2325,7 +2331,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { if (NewRowIter == OrigRowToNewRow.end()) { // If the original row index is not found in the map, update the // stmt_sequence attribute to the 'invalid offset' magic value. - StmtSeq.set(UINT64_MAX); + StmtSeq.set(NewOffsetMissing); continue; } diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp index 427e8b7..29ca268 100644 --- a/llvm/lib/IR/OptBisect.cpp +++ b/llvm/lib/IR/OptBisect.cpp @@ -25,6 +25,11 @@ static OptBisect &getOptBisector() { return OptBisector; } +static OptDisable &getOptDisabler() { + static OptDisable OptDisabler; + return OptDisabler; +} + static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden, cl::init(OptBisect::Disabled), cl::Optional, cl::cb<void, int>([](int Limit) { @@ -37,6 +42,18 @@ static cl::opt<bool> OptBisectVerbose( cl::desc("Show verbose output when opt-bisect-limit is set"), cl::Hidden, cl::init(true), cl::Optional); +static cl::list<std::string> OptDisablePasses( + "opt-disable", cl::Hidden, cl::CommaSeparated, cl::Optional, + cl::cb<void, std::string>([](const std::string &Pass) { + getOptDisabler().setDisabled(Pass); + }), + cl::desc("Optimization pass(es) to disable (comma-separated list)")); + +static cl::opt<bool> + OptDisableVerbose("opt-disable-enable-verbosity", + cl::desc("Show verbose output when opt-disable is set"), + cl::Hidden, cl::init(false), cl::Optional); + static void printPassMessage(StringRef Name, int PassNum, StringRef TargetDesc, bool Running) { StringRef Status = Running ? "" : "NOT "; @@ -55,4 +72,27 @@ bool OptBisect::shouldRunPass(StringRef PassName, return ShouldRun; } -OptPassGate &llvm::getGlobalPassGate() { return getOptBisector(); } +static void printDisablePassMessage(const StringRef &Name, StringRef TargetDesc, + bool Running) { + StringRef Status = Running ? 
"" : "NOT "; + dbgs() << "OptDisable: " << Status << "running pass " << Name << " on " + << TargetDesc << "\n"; +} + +void OptDisable::setDisabled(StringRef Pass) { DisabledPasses.insert(Pass); } + +bool OptDisable::shouldRunPass(StringRef PassName, + StringRef IRDescription) const { + assert(isEnabled()); + + const bool ShouldRun = !DisabledPasses.contains(PassName); + if (OptDisableVerbose) + printDisablePassMessage(PassName, IRDescription, ShouldRun); + return ShouldRun; +} + +OptPassGate &llvm::getGlobalPassGate() { + if (getOptDisabler().isEnabled()) + return getOptDisabler(); + return getOptBisector(); +} diff --git a/llvm/lib/IR/Pass.cpp b/llvm/lib/IR/Pass.cpp index 2c5ef71..dec7c9a 100644 --- a/llvm/lib/IR/Pass.cpp +++ b/llvm/lib/IR/Pass.cpp @@ -62,8 +62,12 @@ static std::string getDescription(const Module &M) { bool ModulePass::skipModule(const Module &M) const { const OptPassGate &Gate = M.getContext().getOptPassGate(); - return Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(M)); + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + return Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(M)); } bool Pass::mustPreserveAnalysisID(char &AID) const { @@ -86,6 +90,16 @@ StringRef Pass::getPassName() const { return "Unnamed pass: implement Pass::getPassName()"; } +/// getPassArgument - Return a nice clean name for a pass +/// corresponding to that used to enable the pass in opt +StringRef Pass::getPassArgument() const { + AnalysisID AID = getPassID(); + const PassInfo *PI = Pass::lookupPassInfo(AID); + if (PI) + return PI->getPassArgument(); + return ""; +} + void Pass::preparePassManager(PMStack &) { // By default, don't do anything. } @@ -173,8 +187,12 @@ static std::string getDescription(const Function &F) { bool FunctionPass::skipFunction(const Function &F) const { OptPassGate &Gate = F.getContext().getOptPassGate(); - if (Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(F))) + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + if (Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(F))) return true; if (F.hasOptNone()) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index b1864897..5936ac7 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -135,6 +135,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, } } +RTLIB::LibcallImpl +RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { + const ArrayRef<uint16_t> RuntimeLibcallNameOffsets( + RuntimeLibcallNameOffsetTable); + + iterator_range<ArrayRef<uint16_t>::const_iterator> Range = + getRecognizedLibcallImpls(FuncName); + + for (auto I = Range.begin(); I != Range.end(); ++I) { + RTLIB::LibcallImpl Impl = + static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin()); + + // FIXME: This should not depend on looking up ImplToLibcall, only the list + // of libcalls for the module. 
+ RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; + if (Recognized != RTLIB::Unsupported) + return Recognized; + } + + return RTLIB::Unsupported; +} + +iterator_range<ArrayRef<uint16_t>::const_iterator> +RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { + StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); + if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) + return iterator_range(ArrayRef<uint16_t>()); + + uint16_t IndexVal = It.offset().value(); + const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable); + + ArrayRef<uint16_t>::const_iterator E = TableRef.end(); + ArrayRef<uint16_t>::const_iterator EntriesBegin = + std::lower_bound(TableRef.begin(), E, IndexVal); + ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin; + + while (EntriesEnd != E && *EntriesEnd == IndexVal) + ++EntriesEnd; + + assert(EntriesBegin != E && + "libcall found in name table but not offset table"); + + return make_range(EntriesBegin, EntriesEnd); +} + bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { switch (TT.getOS()) { case Triple::MacOSX: diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 3a330db..67c53e0 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -407,9 +407,8 @@ public: const MCPseudoProbeInlineStack &InlineStack, MCSymbol *FnSym) override; - std::optional<std::pair<bool, std::string>> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) override; + void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) override; void emitAddrsig() override; void emitAddrsigSym(const MCSymbol *Sym) override; @@ -2468,10 +2467,8 @@ void MCAsmStreamer::emitPseudoProbe(uint64_t Guid, uint64_t Index, EmitEOL(); } -std::optional<std::pair<bool, std::string>> -MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc, - const MCSubtargetInfo &STI) { +void MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc) { OS << "\t.reloc "; MAI->printExpr(OS, Offset); OS << ", " << Name; @@ -2480,7 +2477,6 @@ MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, MAI->printExpr(OS, *Expr); } EmitEOL(); - return std::nullopt; } void MCAsmStreamer::emitAddrsig() { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index d4d10e0..3e96bdf 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -398,6 +398,10 @@ bool MCAssembler::registerSymbol(const MCSymbol &Symbol) { return Changed; } +void MCAssembler::addRelocDirective(RelocDirective RD) { + relocDirectives.push_back(RD); +} + /// Write the fragment \p F to the output file. static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, const MCFragment &F) { @@ -695,6 +699,27 @@ void MCAssembler::layout() { // helps check whether a PC-relative fixup is fully resolved. this->HasFinalLayout = true; + // Resolve .reloc offsets and add fixups. + for (auto &PF : relocDirectives) { + MCValue Res; + auto &O = PF.Offset; + if (!O.evaluateAsValue(Res, *this)) { + getContext().reportError(O.getLoc(), ".reloc offset is not relocatable"); + continue; + } + auto *Sym = Res.getAddSym(); + auto *F = Sym ? Sym->getFragment() : nullptr; + auto *Sec = F ? 
F->getParent() : nullptr; + if (Res.getSubSym() || !Sec) { + getContext().reportError(O.getLoc(), + ".reloc offset is not relative to a section"); + continue; + } + + uint64_t Offset = Sym ? Sym->getOffset() + Res.getConstant() : 0; + F->addFixup(MCFixup::create(Offset, PF.Expr, PF.Kind)); + } + // Evaluate and apply the fixups, generating relocation entries as necessary. for (MCSection &Sec : *this) { for (MCFragment &F : Sec) { @@ -710,13 +735,17 @@ void MCAssembler::layout() { // In the variable part, fixup offsets are relative to the fixed part's // start. Extend the variable contents to the left to account for the // fixed part size. - Contents = MutableArrayRef(F.getParent()->ContentStorage) - .slice(F.VarContentStart - Contents.size(), F.getSize()); - for (MCFixup &Fixup : F.getVarFixups()) { - uint64_t FixedValue; - MCValue Target; - evaluateFixup(F, Fixup, Target, FixedValue, - /*RecordReloc=*/true, Contents); + auto VarFixups = F.getVarFixups(); + if (VarFixups.size()) { + Contents = + MutableArrayRef(F.getParent()->ContentStorage) + .slice(F.VarContentStart - Contents.size(), F.getSize()); + for (MCFixup &Fixup : VarFixups) { + uint64_t FixedValue; + MCValue Target; + evaluateFixup(F, Fixup, Target, FixedValue, + /*RecordReloc=*/true, Contents); + } } } else if (auto *AF = dyn_cast<MCAlignFragment>(&F)) { // For RISC-V linker relaxation, an alignment relocation might be diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 070be62..12b3fba 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -734,9 +734,8 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name, UniqueName.append("/").append(P->getName()); } // Do the lookup. If we don't have a hit, return a new section. - auto IterBool = GOFFUniquingMap.insert(std::make_pair(UniqueName, nullptr)); - auto Iter = IterBool.first; - if (!IterBool.second) + auto [Iter, Inserted] = GOFFUniquingMap.try_emplace(UniqueName); + if (!Inserted) return Iter->second; StringRef CachedName = StringRef(Iter->first.c_str(), Name.size()); diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index ffc5722..49071bd 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -314,8 +314,9 @@ void MCELFStreamer::emitIdent(StringRef IdentString) { popSection(); } -void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, - uint64_t Offset) { +void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *Sym, + uint64_t Offset, + const MCSymbolRefExpr *&SRE) { const MCSymbol *S = &SRE->getSymbol(); if (S->isTemporary()) { if (!S->isInSection()) { @@ -328,13 +329,9 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, S->setUsedInReloc(); SRE = MCSymbolRefExpr::create(S, getContext(), SRE->getLoc()); } - const MCConstantExpr *MCOffset = MCConstantExpr::create(Offset, getContext()); - if (std::optional<std::pair<bool, std::string>> Err = - MCObjectStreamer::emitRelocDirective( - *MCOffset, "BFD_RELOC_NONE", SRE, SRE->getLoc(), - *getContext().getSubtargetInfo())) - report_fatal_error("Relocation for CG Profile could not be created: " + - Twine(Err->second)); + auto *O = MCBinaryExpr::createAdd( + Sym, MCConstantExpr::create(Offset, getContext()), getContext()); + MCObjectStreamer::emitRelocDirective(*O, "BFD_RELOC_NONE", SRE); } void MCELFStreamer::finalizeCGProfile() { @@ -347,9 +344,11 @@ void MCELFStreamer::finalizeCGProfile() { pushSection(); switchSection(CGProfile); uint64_t Offset = 0; + auto *Sym = + 
MCSymbolRefExpr::create(CGProfile->getBeginSymbol(), getContext()); for (auto &E : W.getCGProfile()) { - finalizeCGProfileEntry(E.From, Offset); - finalizeCGProfileEntry(E.To, Offset); + finalizeCGProfileEntry(Sym, Offset, E.From); + finalizeCGProfileEntry(Sym, Offset, E.To); emitIntValue(E.Count, sizeof(uint64_t)); Offset += sizeof(uint64_t); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index c0cef0f..67433f2 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -46,35 +46,6 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } -// When fixup's offset is a forward declared label, e.g.: -// -// .reloc 1f, R_MIPS_JALR, foo -// 1: nop -// -// postpone adding it to Fixups vector until the label is defined and its offset -// is known. -void MCObjectStreamer::resolvePendingFixups() { - for (PendingMCFixup &PendingFixup : PendingFixups) { - if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) { - getContext().reportError(PendingFixup.Fixup.getLoc(), - "unresolved relocation offset"); - continue; - } - PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset() + - PendingFixup.Fixup.getOffset()); - - // If the location symbol to relocate is in MCEncodedFragment, - // put the Fixup into location symbol's fragment. Otherwise - // put into PendingFixup.DF - MCFragment *F = PendingFixup.Sym->getFragment(); - if (F->isEncoded()) - F->addFixup(PendingFixup.Fixup); - else - PendingFixup.DF->addFixup(PendingFixup.Fixup); - } - PendingFixups.clear(); -} - // As a compile-time optimization, avoid allocating and evaluating an MCExpr // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed // part. @@ -607,76 +578,14 @@ void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, insert(getContext().allocFragment<MCOrgFragment>(*Offset, Value, Loc)); } -static std::optional<std::pair<bool, std::string>> -getOffsetAndDataFragment(const MCSymbol &Symbol, uint32_t &RelocOffset, - MCFragment *&DF) { - if (Symbol.isVariable()) { - const MCExpr *SymbolExpr = Symbol.getVariableValue(); - MCValue OffsetVal; - if (!SymbolExpr->evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string("symbol in .reloc offset is not " - "relocatable")); - if (OffsetVal.isAbsolute()) { - RelocOffset = OffsetVal.getConstant(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, <some expr> - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast<MCFragment>(Fragment); - return std::nullopt; - } - - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc symbol offset is not " - "representable")); - - const MCSymbol &SA = *OffsetVal.getAddSym(); - if (!SA.isDefined()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "not defined")); - - if (SA.isVariable()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "variable")); - - MCFragment *Fragment = SA.getFragment(); - // FIXME Support symbols with no DF. 
For example: - // .reloc .data, ENUM_VALUE, <some expr> - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - RelocOffset = SA.getOffset() + OffsetVal.getConstant(); - DF = cast<MCFragment>(Fragment); - } else { - RelocOffset = Symbol.getOffset(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, <some expr> - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast<MCFragment>(Fragment); - } - return std::nullopt; -} - -std::optional<std::pair<bool, std::string>> -MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc Loc, - const MCSubtargetInfo &STI) { +void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) { std::optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name); - if (!MaybeKind) - return std::make_pair(true, std::string("unknown relocation name")); + if (!MaybeKind) { + getContext().reportError(Loc, "unknown relocation name"); + return; + } MCFixupKind Kind = *MaybeKind; if (Expr) @@ -685,38 +594,14 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, Expr = MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); - MCFragment *DF = getOrCreateDataFragment(&STI); - MCValue OffsetVal; - if (!Offset.evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string(".reloc offset is not relocatable")); - if (OffsetVal.isAbsolute()) { - if (OffsetVal.getConstant() < 0) - return std::make_pair(false, std::string(".reloc offset is negative")); - DF->addFixup(MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; - } - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc offset is not representable")); - - const MCSymbol &Symbol = *OffsetVal.getAddSym(); - if (Symbol.isDefined()) { - uint32_t SymbolOffset = 0; - std::optional<std::pair<bool, std::string>> Error = - getOffsetAndDataFragment(Symbol, SymbolOffset, DF); - - if (Error != std::nullopt) - return Error; - - DF->addFixup( - MCFixup::create(SymbolOffset + OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + auto *O = &Offset; + int64_t Val; + if (Offset.evaluateAsAbsolute(Val, nullptr)) { + auto *SecSym = getCurrentSectionOnly()->getBeginSymbol(); + O = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(SecSym, getContext()), + O, getContext(), Loc); } - - PendingFixups.emplace_back( - &Symbol, DF, MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + getAssembler().addRelocDirective({*O, Expr, Kind}); } void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue, @@ -799,6 +684,5 @@ void MCObjectStreamer::finishImpl() { // Emit pseudo probes for the current module. 
MCPseudoProbeTable::emit(this); - resolvePendingFixups(); getAssembler().Finish(); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index a36b2de..77bf843 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3079,7 +3079,6 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { const MCExpr *Offset; const MCExpr *Expr = nullptr; - SMLoc OffsetLoc = Lexer.getTok().getLoc(); if (parseExpression(Offset)) return true; @@ -3105,13 +3104,7 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { if (parseEOL()) return true; - const MCTargetAsmParser &MCT = getTargetParser(); - const MCSubtargetInfo &STI = MCT.getSTI(); - if (std::optional<std::pair<bool, std::string>> Err = - getStreamer().emitRelocDirective(*Offset, Name, Expr, DirectiveLoc, - STI)) - return Error(Err->first ? NameLoc : OffsetLoc, Err->second); - + getStreamer().emitRelocDirective(*Offset, Name, Expr, NameLoc); return false; } diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 2579fa3..79eeb08 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -54,6 +54,11 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; +static bool isPreservedGlobalVarName(StringRef Name) { + return StringRef(PreservedSymbols[0]) == Name || + StringRef(PreservedSymbols[1]) == Name; +} + namespace { const char *getExpectedProducerName() { @@ -81,12 +86,16 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} + BumpPtrAllocator &Alloc, const Triple &TT) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), + Libcalls(TT) {} DenseMap<const Comdat *, int> ComdatMap; Mangler Mang; - Triple TT; + const Triple &TT; + + // FIXME: This shouldn't be here. + RTLIB::RuntimeLibcallsInfo Libcalls; std::vector<storage::Comdat> Comdats; std::vector<storage::Module> Mods; @@ -98,6 +107,10 @@ struct Builder { std::vector<storage::Str> DependentLibraries; + bool isPreservedLibFuncName(StringRef Name) { + return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; + } + void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -213,18 +226,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) { - DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); - // FIXME: Do we need to pass in ABI fields from TargetOptions? 
- RTLIB::RuntimeLibcallsInfo Libcalls(TT); - for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { - if (Impl != RTLIB::Unsupported) - PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); - } - return PreservedSymbolSet; -} - Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet<GlobalValue *, 4> &Used, ModuleSymbolTable::Symbol Msym) { @@ -278,13 +279,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - setStr(Sym.IRName, GV->getName()); - - static const DenseSet<StringRef> PreservedSymbolsSet = - buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); - bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); + StringRef GVName = GV->getName(); + setStr(Sym.IRName, GVName); - if (Used.count(GV) || IsPreservedSymbol) + if (Used.count(GV) || isPreservedLibFuncName(GVName) || + isPreservedGlobalVarName(GVName)) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -351,7 +350,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); - TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -377,7 +375,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) { Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); + const Triple &TT = Mods[0]->getTargetTriple(); + return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index b6318bb..d818993 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -812,6 +812,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::amdgcn: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv64: + case Triple::riscv64be: return {supportsRISCV, resolveRISCV}; default: if (isAMDGPU(Obj)) @@ -851,6 +852,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::r600: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv32: + case Triple::riscv32be: return {supportsRISCV, resolveRISCV}; case Triple::csky: return {supportsCSKY, resolveCSKY}; @@ -897,7 +899,9 @@ uint64_t resolveRelocation(RelocationResolver Resolver, const RelocationRef &R, if (Obj->getArch() != Triple::loongarch32 && Obj->getArch() != Triple::loongarch64 && Obj->getArch() != Triple::riscv32 && - Obj->getArch() != Triple::riscv64) + Obj->getArch() != Triple::riscv64 && + Obj->getArch() != Triple::riscv32be && + Obj->getArch() != Triple::riscv64be) LocData = 0; } } diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 0623e66..f165e85 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1078,9 +1078,13 @@ void OptPassGateInstrumentation::registerCallbacks( if (!PassGate.isEnabled()) return; - PIC.registerShouldRunOptionalPassCallback([this](StringRef PassName, Any IR) { - return this->shouldRun(PassName, IR); - }); + PIC.registerShouldRunOptionalPassCallback( + [this, &PIC](StringRef ClassName, Any IR) { + StringRef PassName = 
PIC.getPassNameForClassName(ClassName); + if (PassName.empty()) + return this->shouldRun(ClassName, IR); + return this->shouldRun(PassName, IR); + }); } raw_ostream &PrintPassInstrumentation::print() { diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c index d00580f..19918aa 100644 --- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c +++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c @@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8], #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); -#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) +#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); return; diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h index deed079..dd71e72 100644 --- a/llvm/lib/Support/BLAKE3/blake3_impl.h +++ b/llvm/lib/Support/BLAKE3/blake3_impl.h @@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(__CYGWIN__) LLVM_LIBRARY_VISIBILITY void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index c8e020d..aea1bb0 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const { } tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { - TokStart = CurPtr; - // This always consumes at least one character. - int CurChar = getNextChar(); + while (true) { + TokStart = CurPtr; + // This always consumes at least one character. + int CurChar = getNextChar(); - switch (CurChar) { - default: - // Handle letters: [a-zA-Z_] - if (isValidIDChar(CurChar, /*First=*/true)) - return LexIdentifier(); - - // Unknown character, emit an error. - return ReturnError(TokStart, "unexpected character"); - case EOF: - // Lex next token, if we just left an include file. - // Note that leaving an include file means that the next - // symbol is located at the end of the 'include "..."' - // construct, so LexToken() is called with default - // false parameter. - if (processEOF()) - return LexToken(); + switch (CurChar) { + default: + // Handle letters: [a-zA-Z_] + if (isValidIDChar(CurChar, /*First=*/true)) + return LexIdentifier(); - // Return EOF denoting the end of lexing. - return tgtok::Eof; - - case ':': return tgtok::colon; - case ';': return tgtok::semi; - case ',': return tgtok::comma; - case '<': return tgtok::less; - case '>': return tgtok::greater; - case ']': return tgtok::r_square; - case '{': return tgtok::l_brace; - case '}': return tgtok::r_brace; - case '(': return tgtok::l_paren; - case ')': return tgtok::r_paren; - case '=': return tgtok::equal; - case '?': return tgtok::question; - case '#': - if (FileOrLineStart) { - tgtok::TokKind Kind = prepIsDirective(); - if (Kind != tgtok::Error) - return lexPreprocessor(Kind); - } + // Unknown character, emit an error. + return ReturnError(TokStart, "unexpected character"); + case EOF: + // Lex next token, if we just left an include file. + if (processEOF()) { + // Leaving an include file means that the next symbol is located at the + // end of the 'include "..."' construct. 
+ FileOrLineStart = false; + break; + } - return tgtok::paste; + // Return EOF denoting the end of lexing. + return tgtok::Eof; + + case ':': + return tgtok::colon; + case ';': + return tgtok::semi; + case ',': + return tgtok::comma; + case '<': + return tgtok::less; + case '>': + return tgtok::greater; + case ']': + return tgtok::r_square; + case '{': + return tgtok::l_brace; + case '}': + return tgtok::r_brace; + case '(': + return tgtok::l_paren; + case ')': + return tgtok::r_paren; + case '=': + return tgtok::equal; + case '?': + return tgtok::question; + case '#': + if (FileOrLineStart) { + tgtok::TokKind Kind = prepIsDirective(); + if (Kind != tgtok::Error) + return lexPreprocessor(Kind); + } + + return tgtok::paste; - // The period is a separate case so we can recognize the "..." - // range punctuator. - case '.': - if (peekNextChar(0) == '.') { - ++CurPtr; // Eat second dot. + // The period is a separate case so we can recognize the "..." + // range punctuator. + case '.': if (peekNextChar(0) == '.') { - ++CurPtr; // Eat third dot. - return tgtok::dotdotdot; + ++CurPtr; // Eat second dot. + if (peekNextChar(0) == '.') { + ++CurPtr; // Eat third dot. + return tgtok::dotdotdot; + } + return ReturnError(TokStart, "invalid '..' punctuation"); } - return ReturnError(TokStart, "invalid '..' punctuation"); - } - return tgtok::dot; + return tgtok::dot; - case '\r': - llvm_unreachable("getNextChar() must never return '\r'"); + case '\r': + llvm_unreachable("getNextChar() must never return '\r'"); - case ' ': - case '\t': - // Ignore whitespace. - return LexToken(FileOrLineStart); - case '\n': - // Ignore whitespace, and identify the new line. - return LexToken(true); - case '/': - // If this is the start of a // comment, skip until the end of the line or - // the end of the buffer. - if (*CurPtr == '/') - SkipBCPLComment(); - else if (*CurPtr == '*') { - if (SkipCComment()) - return tgtok::Error; - } else // Otherwise, this is an error. - return ReturnError(TokStart, "unexpected character"); - return LexToken(FileOrLineStart); - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': case '5': case '6': - case '7': case '8': case '9': { - int NextChar = 0; - if (isDigit(CurChar)) { - // Allow identifiers to start with a number if it is followed by - // an identifier. This can happen with paste operations like - // foo#8i. - int i = 0; - do { - NextChar = peekNextChar(i++); - } while (isDigit(NextChar)); - - if (NextChar == 'x' || NextChar == 'b') { - // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most - // likely a number. - int NextNextChar = peekNextChar(i); - switch (NextNextChar) { - default: - break; - case '0': case '1': - if (NextChar == 'b') - return LexNumber(); - [[fallthrough]]; - case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if (NextChar == 'x') - return LexNumber(); - break; + case ' ': + case '\t': + // Ignore whitespace. + break; + case '\n': + // Ignore whitespace, and identify the new line. + FileOrLineStart = true; + break; + case '/': + // If this is the start of a // comment, skip until the end of the line or + // the end of the buffer. + if (*CurPtr == '/') + SkipBCPLComment(); + else if (*CurPtr == '*') { + if (SkipCComment()) + return tgtok::Error; + } else // Otherwise, this is an error. 
+ return ReturnError(TokStart, "unexpected character"); + break; + case '-': + case '+': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + int NextChar = 0; + if (isDigit(CurChar)) { + // Allow identifiers to start with a number if it is followed by + // an identifier. This can happen with paste operations like + // foo#8i. + int i = 0; + do { + NextChar = peekNextChar(i++); + } while (isDigit(NextChar)); + + if (NextChar == 'x' || NextChar == 'b') { + // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most + // likely a number. + int NextNextChar = peekNextChar(i); + switch (NextNextChar) { + default: + break; + case '0': + case '1': + if (NextChar == 'b') + return LexNumber(); + [[fallthrough]]; + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + if (NextChar == 'x') + return LexNumber(); + break; + } } } - } - if (isValidIDChar(NextChar, /*First=*/true)) - return LexIdentifier(); + if (isValidIDChar(NextChar, /*First=*/true)) + return LexIdentifier(); - return LexNumber(); - } - case '"': return LexString(); - case '$': return LexVarName(); - case '[': return LexBracket(); - case '!': return LexExclaim(); + return LexNumber(); + } + case '"': + return LexString(); + case '$': + return LexVarName(); + case '[': + return LexBracket(); + case '!': + return LexExclaim(); + } } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 7de66cc..12fc976 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( llvm_unreachable("Unsupported ElementSize"); } + // Preserve undef state until DOP's reg is defined. + unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0; + // // Create the destructive operation (if required) // @@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; + DOPRegState = 0; // Create the additional LSL to zero the lanes when the DstReg is not // unique. 
Zeros the lanes in z0 that aren't active in p0 with sequence @@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DOPRegIsUnique && "The destructive operand should be unique"); PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); DOPIdx = 0; + DOPRegState = 0; } // @@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( // DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + DOPRegState = DOPRegState | RegState::Kill; switch (DType) { case AArch64::DestructiveUnaryPassthru: - DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(PredIdx)) .add(MI.getOperand(SrcIdx)); break; @@ -659,12 +665,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) - .add(MI.getOperand(SrcIdx)); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)); break; case AArch64::DestructiveTernaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4f13a14..d04e6c4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17486,9 +17491,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; @@ -17498,9 +17502,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( return false; assert(!Mask && "Unexpected mask on a load\n"); - Value *FirstActive = *llvm::find_if(DeinterleavedValues, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = 
cast<VectorType>(FirstActive->getType()); + VectorType *VTy = getDeinterleavedVectorType(DI); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17528,6 +17530,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); + Value *Result = nullptr; if (NumLoads > 1) { // Create multiple legal small ldN. SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy)); @@ -17548,35 +17551,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) { - if (DeinterleavedValues[J]) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); - } + + // Merge the values from different factors. + Result = PoisonValue::get(DI->getType()); + for (unsigned J = 0; J < Factor; ++J) + Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); } else { - Value *Result; if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned I = 0; I < Factor; I++) { - if (DeinterleavedValues[I]) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); - } - } } + + // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 + DI->replaceAllUsesWith(Result); return true; } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleavedValues) const { + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleavedValues) const { unsigned Factor = InterleavedValues.size(); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } + StoreInst *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!Mask && "Unexpected mask on plain store"); VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType()); const DataLayout &DL = SI->getModule()->getDataLayout(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6afb3c3..713793e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,19 +211,19 @@ public: unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 
index 996b0ed..bc57537 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. 
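For orientation, an editorial sketch that is not part of the patch: taking the LD1i32 (4 x i32) flavour as the example, the chain accepted by this matcher has the shape below, where the virtual register and address names are symbolic and only the opcodes and the subregister index come from the patch. Lane 0 arrives through a SUBREG_TO_REG whose 32-bit source is verified by the single-use check just below, lanes 1 and 2 are ticked off via RemainingLanes (seeded with the half-open range [1, NumLanes-1)), and the last lane is the root itself:

  %s0 = <32-bit scalar load>            ; lane 0 value, e.g. an LDRSui result
  %v0 = SUBREG_TO_REG 0, %s0, ssub
  %v1 = LD1i32 %v0, 1, %addr1           ; lane 1
  %v2 = LD1i32 %v1, 2, %addr2           ; lane 2
  %v3 = LD1i32 %v2, 3, %addr3           ; lane 3, the root of the pattern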
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of an 128 bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
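To make the rewrite concrete, another editorial sketch with the same symbolic names as above (not part of the patch): for the 4 x i32 case the matched chain is split so that lanes 0-1 accumulate in one 128-bit register and lanes 2-3 in another, and the two halves are then interleaved:

  %a  = SUBREG_TO_REG 0, %s0, ssub      ; the original lane-0 scalar load is kept
  %a1 = LD1i32 %a, 1, %addr1            ; register A now holds lanes 0-1
  %m  = LDRSui %addr2, 0                ; the middle lane becomes a plain scalar load,
  %b  = SUBREG_TO_REG 0, %m, ssub       ; which also zeroes the upper lanes
  %b1 = LD1i32 %b, 1, %addr3            ; register B now holds lanes 2-3
  %v3 = ZIP1v2i64 %a1, %b1              ; interleave the low 64-bit halves

The two LD1 chains no longer depend on each other, which is the memory-level-parallelism gain described in the comment above; the i16 and i8 variants follow the same structure with LDRHui/LDRBui and hsub/bsub.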
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da..02734866 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp 
index 233f42b..08f547a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -559,8 +559,7 @@ void AArch64TargetELFStreamer::finish() { if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); - (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), - *Ctx.getSubtargetInfo()); + S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b2b2b37..0e0e83b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -1112,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1954,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -1972,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2381,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -2693,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 280f87b..3d040fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. 
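A worked example of the fold implemented below, spelled out editorially at the IR level with the same symbolic names as the comment above. The equality form replaces the constant arm of the select with the compared value, and the inverted (setne/setone) form is handled symmetrically:

  %cmp = icmp eq i32 %val, 4242
  %sel = select i1 %cmp, i32 4242, i32 %other
    -->
  %sel = select i1 %cmp, i32 %val, i32 %other

  %cmp = icmp ne i32 %val, 4242
  %sel = select i1 %cmp, i32 %other, i32 4242
    -->
  %sel = select i1 %cmp, i32 %other, i32 %val

The rewrite only helps when the constant would otherwise have to be materialized, which is why the code below skips inlinable integer immediates and inlinable or non-normal floating-point constants. As a minimal stand-alone sketch of the integer-side test (the helper name is invented; the [-16, 64] range is the one stated in the code):

  #include <cstdint>
  // True when C cannot be encoded as an inline immediate, so reusing the
  // compared value in the select saves materializing C a second time.
  static bool worthReusingComparedValue(int64_t C) { return C < -16 || C > 64; }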
+static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. + // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebf..9a90787 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ public: // Types of delay that can be encoded in an s_delay_alu instruction. 
enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. + DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. + if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ public: continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ public: LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a4..31a80e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2284,6 +2284,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3c62cd1..3b2f39c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -183,6 +183,7 @@ public: void addPreEmitPass(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9..c8a4e22 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = !if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode. 
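A short editorial note on what the new GVS form adds, grounded in the feature and pseudo definitions of this patch; the assembled syntax at the end is only schematic. On subtargets with FeatureFlatGVSMode, the FLAT_Flat_* multiclasses below give the flat loads, stores and atomics converted here a second _SADDR variant that takes a 64-bit scalar base (SReg_64:$saddr) plus a 32-bit vector offset (VGPR_32:$vaddr), the same scalar-base-plus-vector-offset split the GLOBAL_*_SADDR forms already use, which is why they share GlobalSaddrTable and the GlobalSAddr selection patterns. For example:

  defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
  // expands to FLAT_LOAD_DWORD        (vaddr-only, as before) and
  //            FLAT_LOAD_DWORD_SADDR  (VGPR_32:$vaddr + SReg_64:$saddr,
  //                                    predicated on HasFlatGVSMode)

so a GVS store would assemble to something like 'flat_store_b32 v0, v1, s[2:3] offset:16', following the " $vaddr, $vdata, $saddr$offset$cpol" operand string used by the saddr forms below.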
class GlobalSaddrTable <bit is_saddr, string Name = ""> { bit IsSaddr = is_saddr; string SaddrOp = Name; @@ -237,10 +238,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16<string opName> { - def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>; +multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; + defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, @@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = EnableSaddr; } -multiclass FLAT_Store_Pseudo_t16<string opName> { - def "" : FLAT_Store_Pseudo<opName, VGPR_32>; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>; +multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { + def "" : FLAT_Store_Pseudo<opName, regClass>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Store_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_D16_HI", NAME>; + def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; + } } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, + RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // 
Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; -def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; 
+defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp (inst $saddr, $voffset, $offset, 0, $in) >; +class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt> : GCNPat < +class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, + ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; @@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp let AddedComplexity = 10; } - def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string 
inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat <inst, node, vt>; + + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16_t16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <inst, node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; +defm : FlatLoadPats 
<FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>; def : FlatStorePat <FLAT_STORE_BYTE_t16, 
atomic_store_8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>; -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>; foreach vt = Reg32Types.types in { -def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; -def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; +defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; -def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; } let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, 
az_extloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>; // appropriate waits. 
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>; +defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; @@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op, VFLAT_Aliases_gfx12<name, alias>, VFLAT_Real_gfx12<op, name>; -multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, - string name = get_FLAT_ps<NAME>.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12<op, name, alias> { - defm _RTN : VFLAT_Real_gfx12<op, name>; -} - -multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : VFLAT_Real_Base_gfx12<op, name, alias> { @@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Aliases_gfx12<name> { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, } } -multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, +multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12<op, name, alias> { + VFLAT_Real_AllAddr_gfx12<op, name, alias> { defm _RTN : VFLAT_Real_gfx12<op, name>; defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>; } @@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, } // ENC_VFLAT. 
-defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : 
VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. -defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, 
"global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, 
"global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm 
GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36..a655308 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6843052..268162b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ protected: bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -233,6 +234,7 @@ protected: bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1160,6 +1162,8 @@ public: bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1377,6 +1381,8 @@ public: return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a463..44d9ef5 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65..27212fda 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9308,7 +9308,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359..2af0a57 100644 --- 
a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. -class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool 
hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. - return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<const MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. 
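The class being removed here is re-added below with a different constructor; its header comment describes the score-bracket bookkeeping: each counter keeps a lower/upper bound of issued events, every register remembers the score of its last write, and the wait needed for a register is the distance from that score to the upper bound, clamped below the hardware maximum. A minimal standalone C++ sketch of that idea (illustrative names only, not the real WaitcntBrackets API):

#include <algorithm>
#include <array>
#include <cstdio>
#include <optional>

// Toy model of one counter's score brackets: ScoreUB counts events issued so
// far, ScoreLB is the oldest event not yet known to have completed, and every
// register remembers the score of the event that last wrote it.
struct ToyBrackets {
  unsigned ScoreLB = 0, ScoreUB = 0;
  std::array<unsigned, 16> RegScore{};
  static constexpr unsigned WaitCountMax = 63; // hardware counter limit

  void recordWrite(unsigned Reg) { RegScore[Reg] = ++ScoreUB; }

  // How many events may still be outstanding before Reg is safe to use;
  // clamped to WaitCountMax - 1 so a saturated counter cannot overflow.
  std::optional<unsigned> neededWait(unsigned Reg) const {
    unsigned Score = RegScore[Reg];
    if (Score <= ScoreLB || Score > ScoreUB)
      return std::nullopt; // nothing pending for this register
    return std::min(ScoreUB - Score, WaitCountMax - 1);
  }

  void applyWait(unsigned Count) {
    if (Count >= ScoreUB - ScoreLB)
      return; // the wait is looser than what is already pending
    ScoreLB = ScoreUB - Count;
  }
};

int main() {
  ToyBrackets B;
  B.recordWrite(3); // e.g. a load writing v3
  B.recordWrite(4); // a later load writing v4
  std::printf("wait for v3: %u\n", *B.neededWait(3)); // 1 newer event may stay outstanding
  return 0;
}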
- int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; -}; +class WaitcntBrackets; // This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -640,8 +407,13 @@ public: }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -657,8 +429,6 @@ private: bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -675,7 +445,7 @@ private: // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -686,6 +456,30 @@ public: (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -791,6 +585,211 @@ public: WaitcntBrackets &ScoreBrackets); }; +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + + bool isSmemCounter(InstCounterType T) const { + return T == Context->SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 
1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. 
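As re-added here, WaitcntBrackets no longer carries its own copies of the subtarget, limits and event masks; it reads shared state through a single back-pointer to the pass. A minimal sketch of that shape, with made-up stand-in types rather than the actual LLVM classes:

#include <cstdio>

// Stand-in for the pass that owns the shared, read-only configuration
// (hardware limits, event masks, and so on).
struct PassContext {
  unsigned LoadcntMax = 63;
  unsigned getWaitCountMax() const { return LoadcntMax; }
};

// Per-block scoreboard: rather than copying each shared field into every
// instance, keep one back-pointer to the owning pass and query it on demand.
class Brackets {
public:
  explicit Brackets(const PassContext *Ctx) : Ctx(Ctx) {}
  unsigned clampWait(unsigned Wait) const {
    unsigned Max = Ctx->getWaitCountMax();
    return Wait < Max ? Wait : Max - 1;
  }

private:
  const PassContext *Ctx;
};

int main() {
  PassContext Pass;
  Brackets B(&Pass);                     // a single constructor argument
  std::printf("%u\n", B.clampWait(100)); // 62
  return 0;
}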
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. 
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a1e14d9..9da8a1c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6460,7 +6460,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6484,7 +6484,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; @@ -6537,7 +6537,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. 
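The SIInsertWaitcnts::run() hunk above keeps the existing idiom of reinitializing the per-block brackets in place (explicit destructor call plus placement new) rather than assigning from a temporary, now simply passing `this`. A generic, self-contained sketch of that idiom with a made-up payload type:

#include <memory>
#include <new>

// A payload large enough that building a temporary and assigning it over the
// old object would burn a lot of stack, which is what the comment in run()
// warns against.
struct BigState {
  explicit BigState(int Seed) : Seed(Seed) {}
  int Seed;
  unsigned Scores[1 << 16] = {};
};

void reset(std::unique_ptr<BigState> &P, int Seed) {
  if (!P) {
    P = std::make_unique<BigState>(Seed);
    return;
  }
  // Reinitialize in place: destroy the old object, then construct a fresh one
  // in the same storage. No BigState temporary ever exists on the stack, and
  // the unique_ptr keeps owning the same allocation.
  P->~BigState();
  new (P.get()) BigState(Seed);
}

int main() {
  std::unique_ptr<BigState> P;
  reset(P, 1);
  reset(P, 2); // second call takes the in-place path
  return P->Seed == 2 ? 0 : 1;
}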
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence @@ -10466,10 +10466,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a380199..3a48e65 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -867,6 +867,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd9..5097ac03 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast<const SITargetLowering *>(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9df2bde..7725881 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? 
Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a..c9d2c28 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e2f3710..8c35fea 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -531,6 +534,11 @@ let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1133,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; +defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; @@ -1141,6 +1150,11 @@ defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : 
VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 65d1c4e..fd3b052 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); auto T = const_cast<Type*>(CP->getType()); auto C = const_cast<Constant*>(CP->getConstVal()); - auto M = const_cast<Module*>(DAG.getMachineFunction(). - getFunction().getParent()); + auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + @@ -21585,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21593,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5f4aef5..9159f3d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -681,7 +681,7 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index ce43645..f0e2e78 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { Info.RootFlattenedArrayType, Info.RootPointerOperand, {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags()); + // If the pointer operand is a global variable and all indices are 0, + // IRBuilder::CreateGEP will return the global variable instead of creating + // a GEP instruction or GEP ConstantExpr. In this case we have to create and + // insert our own GEP instruction. + if (!isa<GEPOperator>(NewGEP)) + NewGEP = GetElementPtrInst::Create( + Info.RootFlattenedArrayType, Info.RootPointerOperand, + {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(), + Builder.GetInsertPoint()); + // Replace the current GEP with the new GEP. 
Store GEPInfo into the map // for later use in case this GEP was not the end of the chain GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)}); diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp index c9ff713..c73648f 100644 --- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp +++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp @@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I, } static void -legalizeLoadStoreOnArrayAllocas(Instruction &I, +legalizeScalarLoadStoreOnArrays(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { @@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I, } else return; - assert(LoadStoreTy->isSingleValueType() && - "Expected load/store type to be a single-valued type"); + // If the load/store is not of a single-value type (i.e., scalar or vector) + // then we do not modify it. It shouldn't be a vector either because the + // dxil-data-scalarization pass is expected to run before this, but it's not + // incorrect to apply this transformation to vector load/stores. + if (!LoadStoreTy->isSingleValueType()) + return; - auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp); - if (!AllocaPtrOp) + Type *ArrayTy; + if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp)) + ArrayTy = GlobalVarPtrOp->getValueType(); + else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp)) + ArrayTy = AllocaPtrOp->getAllocatedType(); + else return; - Type *Ty = AllocaPtrOp->getAllocatedType(); - if (!isa<ArrayType>(Ty)) + if (!isa<ArrayType>(ArrayTy)) return; - assert(!isa<ArrayType>(Ty->getArrayElementType()) && - "Expected allocated type of AllocaInst to be a flat ArrayType"); - IRBuilder<> Builder(&I); - Value *Zero = Builder.getInt32(0); - Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "", - GEPNoWrapFlags::all()); + assert(ArrayTy->getArrayElementType() == LoadStoreTy && + "Expected array element type to be the same as to the scalar load or " + "store type"); + + Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0); + Value *GEP = GetElementPtrInst::Create( + ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator()); I.setOperand(PtrOpIndex, GEP); } @@ -651,7 +659,7 @@ private: // downcastI64toI32InsertExtractElements needs to handle. 
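The DXILFlattenArrays hunk above works around IRBuilder::CreateGEP constant-folding an all-zero-index GEP on a global down to the global itself, per the comment it adds. A standalone sketch of that behavior and of the explicit GetElementPtrInst::Create fallback (module and value names here are invented for the demo):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *ArrTy = ArrayType::get(Type::getInt32Ty(Ctx), 8);
  auto *GV = new GlobalVariable(M, ArrTy, /*isConstant=*/false,
                                GlobalValue::InternalLinkage,
                                Constant::getNullValue(ArrTy), "g");
  auto *F = Function::Create(FunctionType::get(Type::getVoidTy(Ctx), false),
                             Function::ExternalLinkage, "f", M);
  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", F));

  Value *Zero = Builder.getInt32(0);
  // Every index is a constant zero, so the builder constant-folds the GEP
  // away and hands back the global itself.
  Value *Folded = Builder.CreateGEP(ArrTy, GV, {Zero, Zero});
  errs() << "is a GEP? " << (isa<GEPOperator>(Folded) ? "yes" : "no") << "\n";

  // The fallback taken by the pass: force a real GEP instruction.
  Value *GEP = GetElementPtrInst::Create(ArrTy, GV, {Zero, Zero}, "gep",
                                         Builder.GetInsertBlock());
  errs() << *GEP << "\n";
  return 0;
}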
LegalizationPipeline[Stage2].push_back( downcastI64toI32InsertExtractElements); - LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas); + LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays); } }; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 53943de3..e285e04 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { R = N; break; } + case ISD::AssertSext: { + EVT T = cast<VTSDNode>(N.getOperand(1))->getVT(); + if (T.getSizeInBits() == 32) + R = N.getOperand(0); + else + return false; + break; + } + default: return false; } diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d5a5f17..36c3011 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the baisc single-precision floating-point instructions. +// This file describes the basic single-precision floating-point instructions. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index c47987f..2378664 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VecTy = Op->getOperand(0)->getValueType(0); SDValue Idx = Op->getOperand(1); - EVT EltTy = VecTy.getVectorElementType(); unsigned NumElts = VecTy.getVectorNumElements(); - if (isa<ConstantSDNode>(Idx) && - (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) + if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) return Op; return SDValue(); @@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 95e9fd4..a0107e4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>; } +multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 4))), + !add(imm2, 4)), + (XVEXTRINS_W $xd, $xj, Imm)>; + } + } +} + +multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 
0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 2))), + !add(imm2, 2)), + (XVEXTRINS_D $xd, $xj, Imm)>; + } + } +} + let Predicates = [HasExtLASX] in { // XVADD_{B/H/W/D} @@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">; defm : PatCCXrXrF<SETO, "XVFCMP_COR">; defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">; +// Insert two elements extracted from vector into vector. (The positions +// of the two elements must be same in the source or destination vector's +// front and back 128bits.) +// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D} +// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v32i8:$xd, + (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2), + (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))), + !add(imm2, 16)), + (XVEXTRINS_B $xd, $xj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v16i16:$xd, + (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2), + (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))), + !add(imm2, 8)), + (XVEXTRINS_H $xd, $xj, Imm)>; + } +} + +defm : PairInsertExtractPatV8<v8i32, GRLenVT>; +defm : PairInsertExtractPatV8<v8f32, f32>; +defm : PairInsertExtractPatV4<v4i64, GRLenVT>; +defm : PairInsertExtractPatV4<v4f64, f64>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; @@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; - -def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; +def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), + (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), + (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), @@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { def : RegRegStPat<store, XVSTX, LASX256, vt>; } +// Bitcast float/double element extracted from vector to integer. 
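The LSX/LASX patterns added in these hunks fold an extract-then-insert pair into a single vextrins, whose 8-bit immediate packs the destination element index in the high nibble and the source index in the low nibble (the TableGen !or(!shl(imm2, 4), imm1)). A small C++ model of the .w form and its immediate encoding; the helper names are made up:

#include <array>
#include <cstdint>
#include <cstdio>

// Immediate encoding used by the patterns: destination element index in the
// high nibble, source element index in the low nibble.
static uint8_t vextrinsImm(unsigned DstIdx, unsigned SrcIdx) {
  return static_cast<uint8_t>((DstIdx << 4) | SrcIdx);
}

// Model of the .w form on a 128-bit LSX register: copy one 32-bit element of
// vj into one slot of vd, leaving the rest of vd unchanged.
static void vextrins_w(std::array<uint32_t, 4> &Vd,
                       const std::array<uint32_t, 4> &Vj, uint8_t Imm) {
  Vd[(Imm >> 4) & 0x3] = Vj[Imm & 0x3];
}

int main() {
  std::array<uint32_t, 4> Vd{0, 0, 0, 0}, Vj{10, 11, 12, 13};
  vextrins_w(Vd, Vj, vextrinsImm(/*DstIdx=*/2, /*SrcIdx=*/1));
  std::printf("vd[2] = %u\n", static_cast<unsigned>(Vd[2])); // 11
  return 0;
}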
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))), + (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))), + (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>; + // Vector extraction with constant index. +foreach imm = 16...31 in { + defvar Imm = !and(imm, 15); + def : Pat<(i64 (vector_extract v32i8:$xj, imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128), + Imm)>; +} +foreach imm = 8...15 in { + defvar Imm = !and(imm, 7); + def : Pat<(i64 (vector_extract v16i16:$xj, imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128), + Imm)>; +} def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d73d780..962e7c2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst, (Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>; } +multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_W $vd, $vj, Imm)>; + } + } +} + +multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_D $vd, $vj, Imm)>; + } + } +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">; defm : PatCCVrVrF<SETO, "VFCMP_COR">; defm : PatCCVrVrF<SETUO, "VFCMP_CUN">; +// Insert element extracted from vector into vector. 
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v16i8:$vd, + (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2), + (VEXTRINS_B $vd, $vj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v8i16:$vd, + (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2), + (VEXTRINS_H $vd, $vj, Imm)>; + } +} + +defm : InsertExtractPatV4<v4i32, GRLenVT>; +defm : InsertExtractPatV4<v4f32, f32>; +defm : InsertExtractPatV2<v2i64, GRLenVT>; +defm : InsertExtractPatV2<v2f64, f64>; + // VINSGR2VR_{B/H/W/D} def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; @@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; - +def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm), + (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), + (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), @@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { def : RegRegStPat<store, VSTX, LSX128, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))), + (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))), + (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>; + // Vector extraction with constant index. def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01e4d17..259b71b 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, TOut.getStreamer().emitRelocDirective( *TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR", - RelocJalrExpr, IDLoc, *STI); + RelocJalrExpr); TOut.getStreamer().emitLabel(TmpLabel); } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index c18ba44..ca03310 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -166,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, OutStreamer.emitRelocDirective( *OffsetExpr, Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + CaleeExpr); OutStreamer.emitLabel(OffsetLabel); return; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d017c65..7aa06f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1048,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); + // Enable custom lowering for the following: + // * MVT::i128 - clusterlaunchcontrol + // * MVT::i32 - prmt + // * MVT::Other - internal.addrspace.wrap + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, + Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, + {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); +} + +static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. @@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); } - return DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return getPRMT(L, R, SelectionValue, DL, DAG); }; auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + return DAG.getBitcast(VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 
@@ -2176,11 +2189,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32, DAG.getZExtOrTrunc(Index, DL, MVT::i32), DAG.getConstant(0x7770, DL, MVT::i32)); - SDValue PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::i32, - {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32), - Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector), + DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG); + SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8); + Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8); + Ext->setFlags(Flags); + return Ext; } // Constant index will be matched by tablegen. @@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDLoc DL(Op); - return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, - DAG.getConstant(Selector, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1), + DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG); + return DAG.getBitcast(Op.getValueType(), PRMT); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift @@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { + const unsigned Mode = [&]() { + switch (Op->getConstantOperandVal(0)) { + case Intrinsic::nvvm_prmt: + return NVPTX::PTXPrmtMode::NONE; + case Intrinsic::nvvm_prmt_b4e: + return NVPTX::PTXPrmtMode::B4E; + case Intrinsic::nvvm_prmt_ecl: + return NVPTX::PTXPrmtMode::ECL; + case Intrinsic::nvvm_prmt_ecr: + return NVPTX::PTXPrmtMode::ECR; + case Intrinsic::nvvm_prmt_f4e: + return NVPTX::PTXPrmtMode::F4E; + case Intrinsic::nvvm_prmt_rc16: + return NVPTX::PTXPrmtMode::RC16; + case Intrinsic::nvvm_prmt_rc8: + return NVPTX::PTXPrmtMode::RC8; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + SDLoc DL(Op); + SDValue A = Op->getOperand(1); + SDValue B = Op.getNumOperands() == 4 ? 
Op.getOperand(2) + : DAG.getConstant(0, DL, MVT::i32); + SDValue Selector = (Op->op_end() - 1)->get(); + return getPRMT(A, B, Selector, DL, DAG, Mode); +} static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; + case Intrinsic::nvvm_prmt: + case Intrinsic::nvvm_prmt_b4e: + case Intrinsic::nvvm_prmt_ecl: + case Intrinsic::nvvm_prmt_ecr: + case Intrinsic::nvvm_prmt_f4e: + case Intrinsic::nvvm_prmt_rc16: + case Intrinsic::nvvm_prmt_rc8: + return lowerPrmtIntrinsic(Op, DAG); case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: @@ -5775,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto &DAG = DCI.DAG; - auto PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT); + auto PRMT = + getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1), + (Op1Bytes << 8) | Op0Bytes, DL, DAG); + return DAG.getBitcast(VT, PRMT); } static SDValue combineADDRSPACECAST(SDNode *N, @@ -5797,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N, return SDValue(); } +// Given a constant selector value and a prmt mode, return the selector value +// normalized to the generic prmt mode. See the PTX ISA documentation for more +// details: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + if (Mode == NVPTX::PTXPrmtMode::NONE) + return Selector; + + const unsigned V = Selector.trunc(2).getZExtValue(); + + const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2, + unsigned S3) { + return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12)); + }; + + switch (Mode) { + case NVPTX::PTXPrmtMode::F4E: + return GetSelector(V, V + 1, V + 2, V + 3); + case NVPTX::PTXPrmtMode::B4E: + return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7); + case NVPTX::PTXPrmtMode::RC8: + return GetSelector(V, V, V, V); + case NVPTX::PTXPrmtMode::ECL: + return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U); + case NVPTX::PTXPrmtMode::ECR: + return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V); + case NVPTX::PTXPrmtMode::RC16: { + unsigned V1 = (V & 1) << 1; + return GetSelector(V1, V1 + 1, V1, V1 + 1); + } + default: + llvm_unreachable("Invalid PRMT mode"); + } +} + +static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + APInt BitField = B.concat(A); + APInt SelectorVal = getPRMTSelector(Selector, Mode); + APInt Result(32, 0); + for (unsigned I : llvm::seq(4U)) { + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + APInt Byte = BitField.extractBits(8, Idx * 8); + if (Sign) + Byte = Byte.ashr(8); + Result.insertBits(Byte, I * 8); + } + return Result; +} + +static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + // Constant fold PRMT + if (isa<ConstantSDNode>(N->getOperand(0)) && + isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) + return 
DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0), + N->getConstantOperandAPInt(1), + N->getConstantOperandAPInt(2), + N->getConstantOperandVal(3)), + SDLoc(N), N->getValueType(0)); + + return SDValue(); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); switch (N->getOpcode()) { - default: break; - case ISD::ADD: - return PerformADDCombine(N, DCI, OptLevel); - case ISD::FADD: - return PerformFADDCombine(N, DCI, OptLevel); - case ISD::MUL: - return PerformMULCombine(N, DCI, OptLevel); - case ISD::SHL: - return PerformSHLCombine(N, DCI, OptLevel); - case ISD::AND: - return PerformANDCombine(N, DCI); - case ISD::UREM: - case ISD::SREM: - return PerformREMCombine(N, DCI, OptLevel); - case ISD::SETCC: - return PerformSETCCCombine(N, DCI, STI.getSmVersion()); - case ISD::LOAD: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadV2: - case NVPTXISD::LoadV4: - return combineUnpackingMovIntoLoad(N, DCI); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); - case ISD::STORE: - case NVPTXISD::StoreV2: - case NVPTXISD::StoreV4: - return PerformStoreCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACTCombine(N, DCI); - case ISD::VSELECT: - return PerformVSELECTCombine(N, DCI); - case ISD::BUILD_VECTOR: - return PerformBUILD_VECTORCombine(N, DCI); - case ISD::ADDRSPACECAST: - return combineADDRSPACECAST(N, DCI); + default: + break; + case ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); + case ISD::ADDRSPACECAST: + return combineADDRSPACECAST(N, DCI); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::BUILD_VECTOR: + return PerformBUILD_VECTORCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACTCombine(N, DCI); + case ISD::FADD: + return PerformFADDCombine(N, DCI, OptLevel); + case ISD::LOAD: + case NVPTXISD::LoadParamV2: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + return combineUnpackingMovIntoLoad(N, DCI); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case NVPTXISD::PRMT: + return combinePRMT(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI, STI.getSmVersion()); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::SREM: + case ISD::UREM: + return PerformREMCombine(N, DCI, OptLevel); + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + return PerformStoreParamCombine(N, DCI); + case ISD::STORE: + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + return PerformStoreCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } @@ -6387,7 +6511,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2)); unsigned Mode = Op.getConstantOperandVal(3); - if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector) + if (!Selector) return; KnownBits AKnown = DAG.computeKnownBits(A, Depth); @@ -6396,7 +6520,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} KnownBits BitField = BKnown.concat(AKnown); - APInt SelectorVal = Selector->getAPIntValue(); + APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = 
Sel.getLoBits(3).getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4eef6c9..a5bb83d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1453,18 +1453,33 @@ let hasSideEffects = false in { (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32rir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins B32:$a, i32imm:$b, B32:$c), + (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; def PRMT_B32rii : BasicFlagsNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>; - def PRMT_B32rir + def PRMT_B32irr : BasicFlagsNVPTXInst<(outs B32:$d), - (ins B32:$a, i32imm:$b, B32:$c), - (ins PrmtMode:$mode), + (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>; + def PRMT_B32iri + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32iir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode), "prmt.b32$mode", - [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; + [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bad4c3c..70150bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1047,24 +1047,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass, // MISC // -class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c), - (PRMT_B32rrr $a, $b, $c, prmt_mode)>; - -class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$c), - (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>; - -def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>; -def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>; -def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>; - -def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>; -def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>; -def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>; -def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>; - - def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32", [(int_nvvm_nanosleep imm:$i)]>, Requires<[hasPTX<63>, hasSM<70>]>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 0f948b2..cfec46d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3058,17 +3058,28 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, }; if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) { + // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if (LHS.getOpcode() == ISD::ADD && - SelectShl(LHS.getOperand(0), Index, Scale) && !isa<ConstantSDNode>(LHS.getOperand(1)) && isInt<12>(C1->getSExtValue())) { - // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) - SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), - SDLoc(Addr), VT); - Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, - LHS.getOperand(1), C1Val), - 0); - return true; + if (SelectShl(LHS.getOperand(1), Index, Scale)) { + SDValue C1Val = 
CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(0), C1Val), + 0); + return true; + } + + // Add is commutative so we need to check both operands. + if (SelectShl(LHS.getOperand(0), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(1), C1Val), + 0); + return true; + } } // Don't match add with constants. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3af729a..e0a8c07 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -429,7 +429,7 @@ public: bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; @@ -437,15 +437,12 @@ public: bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; - - bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveRes) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef<Value *> InterleaveOps) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index ddfacd9..38cc0ce 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -14,6 +14,7 @@ #include "RISCVISelLowering.h" #include "RISCVSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -68,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. 
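As an aside on the new isMultipleOfN helper whose body continues just below: it accepts either a constant (or an nuw multiply by a constant) that is a multiple of N, or, for power-of-two N, a value whose known trailing zero bits cover log2(N). Once one of these holds, the CreateExactUDiv calls introduced later in this file are justified, because dividing the wide EVL by Factor leaves no remainder and therefore drops no trailing elements. A tiny illustration of the property being guarded, with illustrative names only:

// With Factor = 2 and a wide EVL of 5, an exact division would be wrong:
// 5 / 2 == 2 segments, silently dropping the fifth element. The
// multiple-of-Factor check rejects such EVLs before the transform fires.
constexpr bool evlDividesExactly(unsigned WideEVL, unsigned Factor) {
  return WideEVL % Factor == 0;
}
static_assert(!evlDividesExactly(5, 2), "non-multiple EVL would drop elements");
static_assert(evlDividesExactly(8, 2), "multiple EVL divides exactly");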
+ uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -81,21 +115,49 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); + IRBuilder<> Builder(Load); + const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *LI = dyn_cast<LoadInst>(Load)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load\n"); + Mask = Builder.getAllOnesMask(VTy->getElementCount()); + VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); + } else { + auto *VPLoad = cast<VPIntrinsic>(Load); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + Ptr = VPLoad->getMemoryPointerParam(); + Alignment = VPLoad->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + assert(Mask && "vp.load needs a mask!"); + + Value *WideEVL = VPLoad->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -104,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.load + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + CI->addParamAttr(0, + Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; - Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -133,18 +192,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): @@ -234,39 +281,15 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. 
- uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) - return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); - } - - return false; -} - bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor > 8) return false; IRBuilder<> Builder(Load); - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - VectorType *ResVTy = cast<VectorType>(FirstActive->getType()); + VectorType *ResVTy = getDeinterleavedVectorType(DI); const DataLayout &DL = Load->getDataLayout(); auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); @@ -298,10 +321,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) return false; - VL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, - ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); } Type *PtrTy = Ptr->getType(); @@ -346,61 +367,74 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. - Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - + DI->replaceAllUsesWith(Return); return true; } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const { + Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const { unsigned Factor = InterleaveValues.size(); if (Factor > 8) return false; - assert(SI->isSimple()); - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Store); auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); + const DataLayout &DL = Store->getDataLayout(); + Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) - return false; + Value *Ptr, *VL; + Align Alignment; + if (auto *SI = dyn_cast<StoreInst>(Store)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(InVTy->getElementCount()); + VL = isa<FixedVectorType>(InVTy) + ? 
Builder.CreateElementCount(XLenTy, InVTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + } else { + auto *VPStore = cast<VPIntrinsic>(Store); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + Ptr = VPStore->getMemoryPointerParam(); + Alignment = VPStore->getPointerAlignment().value_or( + DL.getABITypeAlign(InVTy->getElementType())); + + assert(Mask && "vp.store needs a mask!"); - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + Value *WideEVL = VPStore->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + Type *PtrTy = Ptr->getType(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) + return false; if (isa<FixedVectorType>(InVTy)) { Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, PtrTy, XLenTy}); SmallVector<Value *, 10> Ops(InterleaveValues); - Value *VL = Builder.CreateElementCount(XLenTy, InVTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - + Ops.append({Ptr, Mask, VL}); Builder.CreateCall(VssegNFunc, Ops); return true; } unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), NumElts * SEW / 8), Factor); - Value *VL = Constant::getAllOnesValue(XLenTy); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) StoredVal = Builder.CreateIntrinsic( @@ -408,131 +442,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), ScalableVssegIntrIds[Factor - 2], + Store->getModule(), ScalableVssegIntrIds[Factor - 2], {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); - Value *Operands[] = {StoredVal, SI->getPointerOperand(), Mask, VL, + Value *Operands[] = {StoredVal, Ptr, Mask, VL, ConstantInt::get(XLenTy, Log2_64(SEW))}; Builder.CreateCall(VssegNFunc, Operands); return true; } -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. 
Lower an interleaved vp.load (Factor = 2): -/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.vector.deinterleave2.nxv64i8( -/// <vscale x 64 x i8> %l) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef, -/// <vscale x 32 x i8> undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). -bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - Value *Return = nullptr; - if (isa<FixedVectorType>(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {VTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonValue::get(VecTupTy), - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector<Type *, 8> AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } - - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. - Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } - - return true; -} - /// Lower an interleaved vp.store into a vssegN intrinsic. /// /// E.g. 
Lower an interleaved vp.store (Factor = 2): @@ -583,9 +501,9 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( auto *PtrTy = Store->getArgOperand(1)->getType(); auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value *EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); if (isa<FixedVectorType>(VTy)) { SmallVector<Value *, 8> Operands(InterleaveOperands); diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 05388f2..3e286a7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,6 +13,17 @@ // //===----------------------------------------------------------------------===// +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); +} + +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -44,6 +55,19 @@ let BufferSize = 0 in { // floating point instructions, this model assumes single issue as // increasing it reduces the gains we saw in performance def SMX60_FP : ProcResource<1>; + + // Vector pipeline + // Single issue for vector store/load instructions + def SMX60_VLS : ProcResource<1>; + + // The C908 user manual says: "Vector floating-point units support vector + // floating-point computation of different bits. In addition, vector integer + // units are added". Developer confirmed it's a separate VIEU + def SMX60_VIEU : ProcResource<1>; + + // The C908 user manual says: "The vector execution unit is developed by + // extending the floating-point unit", so let's assume single issue for now + def SMX60_VFP : ProcResource<1>; } //===----------------------------------------------------------------------===// @@ -232,9 +256,341 @@ let Latency = 4 in { def : WriteRes<WriteFMovI32ToF32, [SMX60_IEU]>; } +// 6. Configuration-Setting Instructions +def : WriteRes<WriteVSETVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETIVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETVL, [SMX60_IEUA]>; + +// 7. 
Vector Loads and Stores +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride loads and stores + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + + // Mask loads and stores + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + + // Strided and indexed loads and stores + foreach eew = [8, 16, 32, 64] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } +} + +// Segmented loads and stores +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride segmented + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } + } +} + +// Whole register move/load/store +foreach LMul = [1, 2, 4, 8] in { + def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>; + + def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; +} + +// 11. 
Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Vector Integer Division and Remainder +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +// Narrowing Shift and Clips +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 12. Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 13. Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c; + + defm "" : 
LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Narrowing +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Vector Floating-Point Division and Square Root +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 14. Vector Reduction Operations +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet<mx, 0, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet<mx, 1, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 15. 
Vector Mask Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 16. Vector Permutation Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; +} + +def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; +def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; + +def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; +def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; + +// Gather and Compress +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; +} + // Others def : WriteRes<WriteCSR, [SMX60_IEU]>; def : WriteRes<WriteNop, [SMX60_IEU]>; +def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -341,10 +697,184 @@ def : ReadAdvance<ReadCLMUL, 0>; def : ReadAdvance<ReadSingleBit, 0>; def : ReadAdvance<ReadSingleBitImm, 0>; +// 6. Configuration-Setting Instructions +def : ReadAdvance<ReadVSETVLI, 0>; +def : ReadAdvance<ReadVSETVL, 0>; + +// 7. 
Vector Loads and Stores +def : ReadAdvance<ReadVLDX, 0>; +def : ReadAdvance<ReadVSTX, 0>; +defm "" : LMULReadAdvance<"ReadVSTEV", 0>; +defm "" : LMULReadAdvance<"ReadVSTM", 0>; +def : ReadAdvance<ReadVLDSX, 0>; +def : ReadAdvance<ReadVSTSX, 0>; +defm "" : LMULReadAdvance<"ReadVSTS8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS64V", 0>; +defm "" : LMULReadAdvance<"ReadVLDUXV", 0>; +defm "" : LMULReadAdvance<"ReadVLDOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTUXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>; +// LMUL Aware +def : ReadAdvance<ReadVST1R, 0>; +def : ReadAdvance<ReadVST2R, 0>; +def : ReadAdvance<ReadVST4R, 0>; +def : ReadAdvance<ReadVST8R, 0>; + +// 12. Vector Integer Arithmetic Instructions +defm : LMULReadAdvance<"ReadVIALUV", 0>; +defm : LMULReadAdvance<"ReadVIALUX", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUX", 0>; +defm : LMULReadAdvance<"ReadVExtV", 0>; +defm : LMULReadAdvance<"ReadVICALUV", 0>; +defm : LMULReadAdvance<"ReadVICALUX", 0>; +defm : LMULReadAdvance<"ReadVShiftV", 0>; +defm : LMULReadAdvance<"ReadVShiftX", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftV", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftX", 0>; +defm : LMULReadAdvance<"ReadVICmpV", 0>; +defm : LMULReadAdvance<"ReadVICmpX", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxV", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxX", 0>; +defm : LMULReadAdvance<"ReadVIMulV", 0>; +defm : LMULReadAdvance<"ReadVIMulX", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivV", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulX", 0>; +defm : LMULReadAdvance<"ReadVIMulAddV", 0>; +defm : LMULReadAdvance<"ReadVIMulAddX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>; +defm : LMULReadAdvance<"ReadVIMergeV", 0>; +defm : LMULReadAdvance<"ReadVIMergeX", 0>; +defm : LMULReadAdvance<"ReadVIMovV", 0>; +defm : LMULReadAdvance<"ReadVIMovX", 0>; + +// 13. Vector Fixed-Point Arithmetic Instructions +defm "" : LMULReadAdvance<"ReadVSALUV", 0>; +defm "" : LMULReadAdvance<"ReadVSALUX", 0>; +defm "" : LMULReadAdvance<"ReadVAALUV", 0>; +defm "" : LMULReadAdvance<"ReadVAALUX", 0>; +defm "" : LMULReadAdvance<"ReadVSMulV", 0>; +defm "" : LMULReadAdvance<"ReadVSMulX", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftV", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftX", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>; + +// 14. 
Vector Floating-Point Instructions +defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>; +defm "" : LMULReadAdvance<"ReadVFClassV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeF", 0>; +defm "" : LMULReadAdvance<"ReadVFMovF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>; +defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; +defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; +defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; + +// 15. Vector Reduction Operations +def : ReadAdvance<ReadVIRedV, 0>; +def : ReadAdvance<ReadVIRedV0, 0>; +def : ReadAdvance<ReadVIWRedV, 0>; +def : ReadAdvance<ReadVIWRedV0, 0>; +def : ReadAdvance<ReadVFRedV, 0>; +def : ReadAdvance<ReadVFRedV0, 0>; +def : ReadAdvance<ReadVFRedOV, 0>; +def : ReadAdvance<ReadVFRedOV0, 0>; +def : ReadAdvance<ReadVFWRedV, 0>; +def : ReadAdvance<ReadVFWRedV0, 0>; +def : ReadAdvance<ReadVFWRedOV, 0>; +def : ReadAdvance<ReadVFWRedOV0, 0>; + +// 16. Vector Mask Instructions +defm "" : LMULReadAdvance<"ReadVMALUV", 0>; +defm "" : LMULReadAdvance<"ReadVMPopV", 0>; +defm "" : LMULReadAdvance<"ReadVMFFSV", 0>; +defm "" : LMULReadAdvance<"ReadVMSFSV", 0>; +defm "" : LMULReadAdvance<"ReadVIotaV", 0>; + +// 17. 
Vector Permutation Instructions +def : ReadAdvance<ReadVMovXS, 0>; +def : ReadAdvance<ReadVMovSX_V, 0>; +def : ReadAdvance<ReadVMovSX_X, 0>; +def : ReadAdvance<ReadVMovFS, 0>; +def : ReadAdvance<ReadVMovSF_V, 0>; +def : ReadAdvance<ReadVMovSF_F, 0>; +defm "" : LMULReadAdvance<"ReadVISlideV", 0>; +defm "" : LMULReadAdvance<"ReadVISlideX", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideV", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideF", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>; +// LMUL Aware +def : ReadAdvance<ReadVMov1V, 0>; +def : ReadAdvance<ReadVMov2V, 0>; +def : ReadAdvance<ReadVMov4V, 0>; +def : ReadAdvance<ReadVMov8V, 0>; + +// Others +def : ReadAdvance<ReadVMask, 0>; +def : ReadAdvance<ReadVPassthru_WorstCase, 0>; +foreach mx = SchedMxList in { + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>; + foreach sew = SchedSEWSet<mx>.val in + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>; +} + //===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; -defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index c2b5e01..e656e8b 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } + // Vector Register Gather with 16-bit Index Elements Instruction + // Dest and source data EEW=SEW. Index vector EEW=16. 
+ case RISCV::VRGATHEREI16_VV: { + if (MO.getOperandNo() == 2) + return 4; + return MILog2SEW; + } + default: return std::nullopt; } @@ -1058,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VSLIDEDOWN_VI: case RISCV::VSLIDE1UP_VX: case RISCV::VFSLIDE1UP_VF: + // Vector Register Gather Instructions + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + case RISCV::VRGATHEREI16_VV: // Vector Single-Width Floating-Point Add/Subtract Instructions case RISCV::VFADD_VF: case RISCV::VFADD_VV: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 6897865..ea78dcd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>; defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>; defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>; -defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>; // GetQuery builtin records: defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>; @@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, 
GetQuery, GlobalOffset>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd0bea0..6608b3f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); + case Intrinsic::spv_refract: + return selectExtInst(ResVReg, ResType, I, GL::Refract); case Intrinsic::spv_reflect: return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 0b4c63f..82e8ce4e 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } +/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized +/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment. +/// Technically only fp128 has a specified ABI, but it makes sense to handle +/// i128 the same until we hear differently. +static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + assert(ValVT == MVT::i32 && "Should have i32 parts"); + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + assert(PendingMembers.size() == 4 && "Should have four parts"); + + int64_t Offset = State.AllocateStack(16, Align(16)); + PendingMembers[0].convertToMem(Offset); + PendingMembers[1].convertToMem(Offset + 4); + PendingMembers[2].convertToMem(Offset + 8); + PendingMembers[3].convertToMem(Offset + 12); + + State.addLoc(PendingMembers[0]); + State.addLoc(PendingMembers[1]); + State.addLoc(PendingMembers[2]); + State.addLoc(PendingMembers[3]); + PendingMembers.clear(); + return true; +} + // Provides entry points of CC_X86 and RetCC_X86. 
#include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa..f020e0b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>, + // On swifttailcc pass swiftself in ECX. CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6bcb7a3..2636979 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1661,7 +1661,7 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 9ad3553..b4639ac 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { - // i128 split into i64 needs to be allocated to two consecutive registers, - // or spilled to the stack as a whole. - return Ty->isIntegerTy(128); + // On x86-64 i128 is split into two i64s and needs to be allocated to two + // consecutive registers, or spilled to the stack as a whole. On x86-32 i128 + // is split to four i32s and never actually passed in registers, but we use + // the consecutive register mark to match it in TableGen. + if (Ty->isIntegerTy(128)) + return true; + + // On x86-32, fp128 acts the same as i128. + if (Subtarget.is32Bit() && Ty->isFP128Ty()) + return true; + + return false; } /// Helper for getByValTypeAlignment to determine diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3..360293bc 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + // Create an interleaved access group. 
IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 8fd91fc..78bd5b4 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1855,7 +1855,7 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) { #if defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64__) || defined(_M_X64) -const StringMap<bool> sys::getHostCPUFeatures() { +StringMap<bool> sys::getHostCPUFeatures() { unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; unsigned MaxLevel; StringMap<bool> Features; @@ -2068,7 +2068,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) -const StringMap<bool> sys::getHostCPUFeatures() { +StringMap<bool> sys::getHostCPUFeatures() { StringMap<bool> Features; std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent(); if (!P) @@ -2148,7 +2148,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { return Features; } #elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64)) -const StringMap<bool> sys::getHostCPUFeatures() { +StringMap<bool> sys::getHostCPUFeatures() { StringMap<bool> Features; // If we're asking the OS at runtime, believe what the OS says @@ -2167,7 +2167,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { } #elif defined(__linux__) && defined(__loongarch__) #include <sys/auxv.h> -const StringMap<bool> sys::getHostCPUFeatures() { +StringMap<bool> sys::getHostCPUFeatures() { unsigned long hwcap = getauxval(AT_HWCAP); bool HasFPU = hwcap & (1UL << 3); // HWCAP_LOONGARCH_FPU uint32_t cpucfg2 = 0x2, cpucfg3 = 0x3; @@ -2196,7 +2196,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && defined(__riscv) -const StringMap<bool> sys::getHostCPUFeatures() { +StringMap<bool> sys::getHostCPUFeatures() { RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}, {/*RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF=*/9, 0}}; @@ -2279,7 +2279,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { return Features; } #else -const StringMap<bool> sys::getHostCPUFeatures() { return {}; } +StringMap<bool> sys::getHostCPUFeatures() { return {}; } #endif #if __APPLE__ diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index d7e206ef..4ca7444 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx1250-insts"] = true; Features["bitop3-insts"] = true; Features["prng-inst"] = true; + Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["fp8-conversion-insts"] = true; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index bcc60c5..be51453 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -64,6 +64,10 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case renderscript64: return "renderscript64"; case riscv32: return "riscv32"; case riscv64: return "riscv64"; + case riscv32be: + return "riscv32be"; + case riscv64be: + return "riscv64be"; case shave: return "shave"; case sparc: return "sparc"; case sparcel: return "sparcel"; @@ -238,7 +242,10 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case wasm64: return "wasm"; case riscv32: - 
case riscv64: return "riscv"; + case riscv64: + case riscv32be: + case riscv64be: + return "riscv"; case ve: return "ve"; case csky: return "csky"; @@ -426,71 +433,73 @@ static Triple::ArchType parseBPFArch(StringRef ArchName) { Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { Triple::ArchType BPFArch(parseBPFArch(Name)); return StringSwitch<Triple::ArchType>(Name) - .Case("aarch64", aarch64) - .Case("aarch64_be", aarch64_be) - .Case("aarch64_32", aarch64_32) - .Case("arc", arc) - .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" - .Case("arm64_32", aarch64_32) - .Case("arm", arm) - .Case("armeb", armeb) - .Case("avr", avr) - .StartsWith("bpf", BPFArch) - .Case("m68k", m68k) - .Case("mips", mips) - .Case("mipsel", mipsel) - .Case("mips64", mips64) - .Case("mips64el", mips64el) - .Case("msp430", msp430) - .Case("ppc64", ppc64) - .Case("ppc32", ppc) - .Case("ppc", ppc) - .Case("ppc32le", ppcle) - .Case("ppcle", ppcle) - .Case("ppc64le", ppc64le) - .Case("r600", r600) - .Case("amdgcn", amdgcn) - .Case("riscv32", riscv32) - .Case("riscv64", riscv64) - .Case("hexagon", hexagon) - .Case("sparc", sparc) - .Case("sparcel", sparcel) - .Case("sparcv9", sparcv9) - .Case("s390x", systemz) - .Case("systemz", systemz) - .Case("tce", tce) - .Case("tcele", tcele) - .Case("thumb", thumb) - .Case("thumbeb", thumbeb) - .Case("x86", x86) - .Case("i386", x86) - .Case("x86-64", x86_64) - .Case("xcore", xcore) - .Case("nvptx", nvptx) - .Case("nvptx64", nvptx64) - .Case("amdil", amdil) - .Case("amdil64", amdil64) - .Case("hsail", hsail) - .Case("hsail64", hsail64) - .Case("spir", spir) - .Case("spir64", spir64) - .Case("spirv", spirv) - .Case("spirv32", spirv32) - .Case("spirv64", spirv64) - .Case("kalimba", kalimba) - .Case("lanai", lanai) - .Case("shave", shave) - .Case("wasm32", wasm32) - .Case("wasm64", wasm64) - .Case("renderscript32", renderscript32) - .Case("renderscript64", renderscript64) - .Case("ve", ve) - .Case("csky", csky) - .Case("loongarch32", loongarch32) - .Case("loongarch64", loongarch64) - .Case("dxil", dxil) - .Case("xtensa", xtensa) - .Default(UnknownArch); + .Case("aarch64", aarch64) + .Case("aarch64_be", aarch64_be) + .Case("aarch64_32", aarch64_32) + .Case("arc", arc) + .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" + .Case("arm64_32", aarch64_32) + .Case("arm", arm) + .Case("armeb", armeb) + .Case("avr", avr) + .StartsWith("bpf", BPFArch) + .Case("m68k", m68k) + .Case("mips", mips) + .Case("mipsel", mipsel) + .Case("mips64", mips64) + .Case("mips64el", mips64el) + .Case("msp430", msp430) + .Case("ppc64", ppc64) + .Case("ppc32", ppc) + .Case("ppc", ppc) + .Case("ppc32le", ppcle) + .Case("ppcle", ppcle) + .Case("ppc64le", ppc64le) + .Case("r600", r600) + .Case("amdgcn", amdgcn) + .Case("riscv32", riscv32) + .Case("riscv64", riscv64) + .Case("riscv32be", riscv32be) + .Case("riscv64be", riscv64be) + .Case("hexagon", hexagon) + .Case("sparc", sparc) + .Case("sparcel", sparcel) + .Case("sparcv9", sparcv9) + .Case("s390x", systemz) + .Case("systemz", systemz) + .Case("tce", tce) + .Case("tcele", tcele) + .Case("thumb", thumb) + .Case("thumbeb", thumbeb) + .Case("x86", x86) + .Case("i386", x86) + .Case("x86-64", x86_64) + .Case("xcore", xcore) + .Case("nvptx", nvptx) + .Case("nvptx64", nvptx64) + .Case("amdil", amdil) + .Case("amdil64", amdil64) + .Case("hsail", hsail) + .Case("hsail64", hsail64) + .Case("spir", spir) + .Case("spir64", spir64) + .Case("spirv", spirv) + .Case("spirv32", spirv32) + .Case("spirv64", spirv64) + .Case("kalimba", kalimba) + 
.Case("lanai", lanai) + .Case("shave", shave) + .Case("wasm32", wasm32) + .Case("wasm64", wasm64) + .Case("renderscript32", renderscript32) + .Case("renderscript64", renderscript64) + .Case("ve", ve) + .Case("csky", csky) + .Case("loongarch32", loongarch32) + .Case("loongarch64", loongarch64) + .Case("dxil", dxil) + .Case("xtensa", xtensa) + .Default(UnknownArch); } static Triple::ArchType parseARMArch(StringRef ArchName) { @@ -559,84 +568,85 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - auto AT = - StringSwitch<Triple::ArchType>(ArchName) - .Cases("i386", "i486", "i586", "i686", Triple::x86) - // FIXME: Do we need to support these? - .Cases("i786", "i886", "i986", Triple::x86) - .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) - .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) - .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) - .Cases("powerpc64le", "ppc64le", Triple::ppc64le) - .Case("xscale", Triple::arm) - .Case("xscaleeb", Triple::armeb) - .Case("aarch64", Triple::aarch64) - .Case("aarch64_be", Triple::aarch64_be) - .Case("aarch64_32", Triple::aarch64_32) - .Case("arc", Triple::arc) - .Case("arm64", Triple::aarch64) - .Case("arm64_32", Triple::aarch64_32) - .Case("arm64e", Triple::aarch64) - .Case("arm64ec", Triple::aarch64) - .Case("arm", Triple::arm) - .Case("armeb", Triple::armeb) - .Case("thumb", Triple::thumb) - .Case("thumbeb", Triple::thumbeb) - .Case("avr", Triple::avr) - .Case("m68k", Triple::m68k) - .Case("msp430", Triple::msp430) - .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", "mipsr6", - Triple::mips) - .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", - Triple::mipsel) - .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", "mips64r6", - "mipsn32r6", Triple::mips64) - .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", - "mipsn32r6el", Triple::mips64el) - .Case("r600", Triple::r600) - .Case("amdgcn", Triple::amdgcn) - .Case("riscv32", Triple::riscv32) - .Case("riscv64", Triple::riscv64) - .Case("hexagon", Triple::hexagon) - .Cases("s390x", "systemz", Triple::systemz) - .Case("sparc", Triple::sparc) - .Case("sparcel", Triple::sparcel) - .Cases("sparcv9", "sparc64", Triple::sparcv9) - .Case("tce", Triple::tce) - .Case("tcele", Triple::tcele) - .Case("xcore", Triple::xcore) - .Case("nvptx", Triple::nvptx) - .Case("nvptx64", Triple::nvptx64) - .Case("amdil", Triple::amdil) - .Case("amdil64", Triple::amdil64) - .Case("hsail", Triple::hsail) - .Case("hsail64", Triple::hsail64) - .Case("spir", Triple::spir) - .Case("spir64", Triple::spir64) - .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) - .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", - "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", - "spirv32v1.6", Triple::spirv32) - .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", - "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", - "spirv64v1.6", Triple::spirv64) - .StartsWith("kalimba", Triple::kalimba) - .Case("lanai", Triple::lanai) - .Case("renderscript32", Triple::renderscript32) - .Case("renderscript64", Triple::renderscript64) - .Case("shave", Triple::shave) - .Case("ve", Triple::ve) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) - .Case("csky", Triple::csky) - .Case("loongarch32", Triple::loongarch32) - .Case("loongarch64", Triple::loongarch64) - .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", - "dxilv1.4", "dxilv1.5", "dxilv1.6", 
"dxilv1.7", "dxilv1.8", - Triple::dxil) - .Case("xtensa", Triple::xtensa) - .Default(Triple::UnknownArch); + auto AT = StringSwitch<Triple::ArchType>(ArchName) + .Cases("i386", "i486", "i586", "i686", Triple::x86) + // FIXME: Do we need to support these? + .Cases("i786", "i886", "i986", Triple::x86) + .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) + .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) + .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) + .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) + .Cases("powerpc64le", "ppc64le", Triple::ppc64le) + .Case("xscale", Triple::arm) + .Case("xscaleeb", Triple::armeb) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("aarch64_32", Triple::aarch64_32) + .Case("arc", Triple::arc) + .Case("arm64", Triple::aarch64) + .Case("arm64_32", Triple::aarch64_32) + .Case("arm64e", Triple::aarch64) + .Case("arm64ec", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) + .Case("m68k", Triple::m68k) + .Case("msp430", Triple::msp430) + .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", + "mipsr6", Triple::mips) + .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", + Triple::mipsel) + .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", + "mips64r6", "mipsn32r6", Triple::mips64) + .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", + "mipsn32r6el", Triple::mips64el) + .Case("r600", Triple::r600) + .Case("amdgcn", Triple::amdgcn) + .Case("riscv32", Triple::riscv32) + .Case("riscv64", Triple::riscv64) + .Case("riscv32be", Triple::riscv32be) + .Case("riscv64be", Triple::riscv64be) + .Case("hexagon", Triple::hexagon) + .Cases("s390x", "systemz", Triple::systemz) + .Case("sparc", Triple::sparc) + .Case("sparcel", Triple::sparcel) + .Cases("sparcv9", "sparc64", Triple::sparcv9) + .Case("tce", Triple::tce) + .Case("tcele", Triple::tcele) + .Case("xcore", Triple::xcore) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) + .Case("amdil", Triple::amdil) + .Case("amdil64", Triple::amdil64) + .Case("hsail", Triple::hsail) + .Case("hsail64", Triple::hsail64) + .Case("spir", Triple::spir) + .Case("spir64", Triple::spir64) + .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) + .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", + "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", + "spirv32v1.6", Triple::spirv32) + .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", + "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", + "spirv64v1.6", Triple::spirv64) + .StartsWith("kalimba", Triple::kalimba) + .Case("lanai", Triple::lanai) + .Case("renderscript32", Triple::renderscript32) + .Case("renderscript64", Triple::renderscript64) + .Case("shave", Triple::shave) + .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) + .Case("csky", Triple::csky) + .Case("loongarch32", Triple::loongarch32) + .Case("loongarch64", Triple::loongarch64) + .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", + "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", + "dxilv1.8", Triple::dxil) + .Case("xtensa", Triple::xtensa) + .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the // ArchType result. 
@@ -966,6 +976,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1688,6 +1700,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::r600: case llvm::Triple::renderscript32: case llvm::Triple::riscv32: + case llvm::Triple::riscv32be: case llvm::Triple::shave: case llvm::Triple::sparc: case llvm::Triple::sparcel: @@ -1718,6 +1731,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::ppc64le: case llvm::Triple::renderscript64: case llvm::Triple::riscv64: + case llvm::Triple::riscv64be: case llvm::Triple::sparcv9: case llvm::Triple::spirv: case llvm::Triple::spir64: @@ -1796,6 +1810,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::riscv32: + case Triple::riscv32be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1828,6 +1843,9 @@ Triple Triple::get32BitArchVariant() const { case Triple::ppc64le: T.setArch(Triple::ppcle); break; case Triple::renderscript64: T.setArch(Triple::renderscript32); break; case Triple::riscv64: T.setArch(Triple::riscv32); break; + case Triple::riscv64be: + T.setArch(Triple::riscv32be); + break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; case Triple::spirv: @@ -1878,6 +1896,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppc64le: case Triple::renderscript64: case Triple::riscv64: + case Triple::riscv64be: case Triple::sparcv9: case Triple::spir64: case Triple::spirv64: @@ -1905,6 +1924,9 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppcle: T.setArch(Triple::ppc64le); break; case Triple::renderscript32: T.setArch(Triple::renderscript64); break; case Triple::riscv32: T.setArch(Triple::riscv64); break; + case Triple::riscv32be: + T.setArch(Triple::riscv64be); + break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; case Triple::spirv: @@ -1943,8 +1965,6 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::renderscript64: - case Triple::riscv32: - case Triple::riscv64: case Triple::shave: case Triple::spir64: case Triple::spir: @@ -1977,6 +1997,12 @@ Triple Triple::getBigEndianArchVariant() const { break; case Triple::ppcle: T.setArch(Triple::ppc); break; case Triple::ppc64le: T.setArch(Triple::ppc64); break; + case Triple::riscv32: + T.setArch(Triple::riscv32be); + break; + case Triple::riscv64: + T.setArch(Triple::riscv64be); + break; case Triple::sparcel: T.setArch(Triple::sparc); break; case Triple::tcele: T.setArch(Triple::tce); break; default: @@ -2014,6 +2040,12 @@ Triple Triple::getLittleEndianArchVariant() const { break; case Triple::ppc: T.setArch(Triple::ppcle); break; case Triple::ppc64: T.setArch(Triple::ppc64le); break; + case Triple::riscv32be: + T.setArch(Triple::riscv32); + break; + case Triple::riscv64be: + T.setArch(Triple::riscv64); + break; case Triple::sparc: T.setArch(Triple::sparcel); break; case Triple::tce: T.setArch(Triple::tcele); break; default: diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fe30c6d..fbeb721 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ 
b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1179,6 +1179,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { AllocaInst *Alloca = P.Alloca; auto *G = GetFramePointer(Alloca); + // Remove any lifetime intrinsics, now that these are no longer allocas. + for (User *U : make_early_inc_range(Alloca->users())) { + auto *I = cast<Instruction>(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } + // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) // here, as we are changing location of the instruction. G->takeName(Alloca); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 73ba0f7..eb4332f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -878,7 +878,11 @@ static Instruction *foldSetClearBits(SelectInst &Sel, // is a vector consisting of 0 and undefs. If a constant compared with x // is a scalar undefined value or undefined vector then an expression // should be already folded into a constant. -static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { +// +// This also holds for all operations such that Op(0) == 0, +// e.g. Shl, Umin, etc. +static Instruction *foldSelectZeroOrFixedOp(SelectInst &SI, + InstCombinerImpl &IC) { auto *CondVal = SI.getCondition(); auto *TrueVal = SI.getTrueValue(); auto *FalseVal = SI.getFalseValue(); @@ -900,10 +904,23 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { // non-zero elements that are masked by undef elements in the compare // constant. auto *TrueValC = dyn_cast<Constant>(TrueVal); - if (TrueValC == nullptr || - !match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) || - !isa<Instruction>(FalseVal)) + if (TrueValC == nullptr || !isa<Instruction>(FalseVal)) + return nullptr; + + bool FreezeY; + if (match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_c_And(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_FShl(m_Specific(X), m_Specific(X), m_Value(Y))) || + match(FalseVal, m_FShr(m_Specific(X), m_Specific(X), m_Value(Y))) || + match(FalseVal, + m_c_Intrinsic<Intrinsic::umin>(m_Specific(X), m_Value(Y)))) { + FreezeY = true; + } else if (match(FalseVal, m_IDiv(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_IRem(m_Specific(X), m_Value(Y)))) { + FreezeY = false; + } else { return nullptr; + } auto *ZeroC = cast<Constant>(cast<Instruction>(CondVal)->getOperand(1)); auto *MergedC = Constant::mergeUndefsWith(TrueValC, ZeroC); @@ -914,9 +931,15 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { return nullptr; auto *FalseValI = cast<Instruction>(FalseVal); - auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"), - FalseValI->getIterator()); - IC.replaceOperand(*FalseValI, FalseValI->getOperand(0) == Y ? 0 : 1, FrY); + if (FreezeY) { + auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"), + FalseValI->getIterator()); + IC.replaceOperand(*FalseValI, + FalseValI->getOperand(0) == Y + ? 0 + : (FalseValI->getOperand(1) == Y ? 
1 : 2), + FrY); + } return IC.replaceInstUsesWith(SI, FalseValI); } @@ -4104,7 +4127,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { return Add; if (Instruction *Or = foldSetClearBits(SI, Builder)) return Or; - if (Instruction *Mul = foldSelectZeroOrMul(SI, *this)) + if (Instruction *Mul = foldSelectZeroOrFixedOp(SI, *this)) return Mul; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index e7a6fa4..55f3239 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -184,22 +185,14 @@ void LowerAllowCheckPass::printPipeline( // correctness. // TODO: print shorter output by combining adjacent runs, etc. int i = 0; - bool printed = false; + ListSeparator LS(";"); for (unsigned int cutoff : Opts.cutoffs) { - if (cutoff > 0) { - if (printed) - OS << ";"; - OS << "cutoffs[" << i << "]=" << cutoff; - printed = true; - } - + if (cutoff > 0) + OS << LS << "cutoffs[" << i << "]=" << cutoff; i++; } - if (Opts.runtime_check) { - if (printed) - OS << ";"; - OS << "runtime_check=" << Opts.runtime_check; - } + if (Opts.runtime_check) + OS << LS << "runtime_check=" << Opts.runtime_check; OS << '>'; } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 739ac00..ed08c0b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1223,6 +1223,24 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { return Result; } +Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + const Loop *L = S->getLoop(); + BasicBlock *EB = L->getExitBlock(); + if (!EB || !EB->getSinglePredecessor() || + !SE.DT.dominates(EB, Builder.GetInsertBlock())) + return nullptr; + + for (auto &PN : EB->phis()) { + if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType()) + continue; + auto *ExitV = SE.getSCEV(&PN); + if (S == ExitV) + return &PN; + } + + return nullptr; +} + Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // In canonical mode we compute the addrec as an expression of a canonical IV // using evaluateAtIteration and expand the resulting SCEV expression. This @@ -1262,6 +1280,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { return V; } + // If S is expanded outside the defining loop, check if there is a + // matching LCSSA phi node for it. 
+ if (Value *V = tryToReuseLCSSAPhi(S)) + return V; + // {X,+,F} --> X + {0,+,F} if (!S->getStart()->isZero()) { if (isa<PointerType>(S->getType())) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index da6af35..6ad5c601 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11693,6 +11693,7 @@ void BoUpSLP::transformNodes() { if (StartIdx + VF > End) continue; SmallVector<std::pair<unsigned, unsigned>> Slices; + bool AllStrided = true; for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { ArrayRef<Value *> Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. @@ -11743,6 +11744,9 @@ void BoUpSLP::transformNodes() { SmallVector<Value *> PointerOps; LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); + AllStrided &= Res == LoadsState::StridedVectorize || + Res == LoadsState::ScatterVectorize || + Res == LoadsState::Gather; // Do not vectorize gathers. if (Res == LoadsState::ScatterVectorize || Res == LoadsState::Gather) { @@ -11772,6 +11776,11 @@ void BoUpSLP::transformNodes() { } Slices.emplace_back(Cnt, Slice.size()); } + // Do not try to vectorize if all slices are strided or gathered with + // vector factor 2 and there are more than 2 slices. It is better to handle + // them in the gathered loads analysis, which may result in better vectorization. + if (VF == 2 && AllStrided && Slices.size() > 2) + continue; auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); if (StartIdx == Cnt) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 703cfe9..204268e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1357,9 +1357,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, public: VPWidenRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL) + const VPIRFlags &Flags, const VPIRMetadata &Metadata, + DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), - Opcode(Opcode) {} + VPIRMetadata(Metadata), Opcode(Opcode) {} VPWidenRecipe(Instruction &I, ArrayRef<VPValue *> Operands) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), @@ -1368,8 +1369,9 @@ public: ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { - auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands()); - R->transferFlags(*this); + auto *R = + new VPWidenRecipe(getOpcode(), operands(), *this, *this, getDebugLoc()); + R->setUnderlyingValue(getUnderlyingValue()); return R; }
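For intuition about the InstCombineSelect change above: foldSelectZeroOrFixedOp now applies to any matched operation Op with Op(0, Y) == 0, not just mul. The following is a small scalar C++ sketch of the equivalence for the umin case, written for illustration only; it is not part of the patch and models the IR transform with plain integers.

#include <algorithm>
#include <cstdint>

// Select form: select (icmp eq X, 0), 0, (umin X, Y)
uint32_t selectForm(uint32_t X, uint32_t Y) {
  return X == 0 ? 0u : std::min(X, Y);
}

// Folded form: umin X, Y. When X == 0, umin(X, Y) is already 0, so dropping
// the select preserves the result. In IR the patch additionally freezes Y so
// that a poison Y cannot leak through the now-unconditional use.
uint32_t foldedForm(uint32_t X, uint32_t Y) {
  return std::min(X, Y);
}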