diff options
Diffstat (limited to 'llvm/lib/Target')
30 files changed, 380 insertions, 192 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 639ddcb..ecaeff7 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering // Post-legalization combines which are primarily optimizations. def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", - [copy_prop, cast_of_cast_combines, + [copy_prop, cast_of_cast_combines, constant_fold_fp_ops, buildvector_of_truncate, integer_of_truncate, mutate_anyext_to_zext, combines_for_extload, combine_indexed_load_store, sext_trunc_sextload, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 31b3d18..7294f3e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16249,7 +16249,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { SDValue Pg = getPredicateForScalableVector(DAG, DL, VT); SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0), @@ -30034,7 +30036,9 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32); @@ -30606,6 +30610,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_ld3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + // Write out unmodified operands. + SmallVector<SDValue, 3> Chains; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG); + Chains.push_back( + DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo())); + } + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains)); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Read back and deinterleave data. + SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other); + SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops); + + SmallVector<SDValue, 3> Results; + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG)); + return DAG.getMergeValues(Results, DL); + } + // Are multi-register uzp instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { @@ -30647,6 +30688,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_INTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_st3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + SmallVector<SDValue, 3> InVecs; + for (SDValue V : Op->ops()) + InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG)); + + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.append(InVecs); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Interleave operands and store. + SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops); + + // Read back the interleaved data. + SmallVector<SDValue, 3> Results; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo()); + Results.push_back(getSVESafeBitCast(OpVT, L, DAG)); + } + + return DAG.getMergeValues(Results, DL); + } + // Are multi-register zip instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 24bef82..8e35ba7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -27,6 +28,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -106,6 +108,7 @@ public: bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; + mutable SmallVector<WeakVH> DeadVals; DenseMap<const PHINode *, bool> BreakPhiNodesCache; @@ -242,6 +245,8 @@ public: Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + bool tryNarrowMathIfNoOverflow(Instruction *I); + public: bool visitFDiv(BinaryOperator &I); @@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } + // Need to use make_early_inc_range because integer division expansion is + // handled by Transform/Utils, and it can delete instructions such as the + // terminator of the BB. + for (BasicBlock &BB : reverse(F)) { + for (Instruction &I : make_early_inc_range(reverse(BB))) { + if (!isInstructionTriviallyDead(&I, TLI)) + MadeChange |= visit(I); } } + + while (!DeadVals.empty()) { + if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + return MadeChange; } @@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Value *NewVal = insertValues(Builder, Ty, ResultVals); NewVal->takeName(&I); I.replaceAllUsesWith(NewVal); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { FoldedT, FoldedF); NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); - BO.eraseFromParent(); + DeadVals.push_back(&BO); if (CastOp) - CastOp->eraseFromParent(); - Sel->eraseFromParent(); + DeadVals.push_back(CastOp); + DeadVals.push_back(Sel); return true; } @@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); + DeadVals.push_back(&FDiv); } return true; @@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`. We accept this change since the non-byte load assumes the upper bits within the byte are all 0. */ -static bool tryNarrowMathIfNoOverflow(Instruction *I, - const SITargetLowering *TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { +bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) { unsigned Opc = I->getOpcode(); Type *OldType = I->getType(); @@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, NewType = I->getType()->getWithNewBitWidth(NewBit); // Old cost + const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F); InstructionCost OldCost = TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); // New cost of new op @@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, Value *Zext = Builder.CreateZExt(Arith, OldType); I->replaceAllUsesWith(Zext); - I->eraseFromParent(); + DeadVals.push_back(I); return true; } @@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (UseMul24Intrin && replaceMulWithMul24(I)) return true; - if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL)) + if (tryNarrowMathIfNoOverflow(&I)) return true; bool Changed = false; @@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (NewDiv) { I.replaceAllUsesWith(NewDiv); - I.eraseFromParent(); + DeadVals.push_back(&I); Changed = true; } } @@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); I.replaceAllUsesWith(ValOrig); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } @@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { } I.replaceAllUsesWith(Vec); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { auto *Intrin = B.CreateIntrinsic( I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); I.replaceAllUsesWith(Intrin); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *Fract = applyFractPat(Builder, FractArg); Fract->takeName(&I); I.replaceAllUsesWith(Fract); - - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } -static bool isOneOrNegOne(const Value *Val) { - const APFloat *C; - return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; -} - // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); @@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { if (ReqdAccuracy < 1.0f) return false; - // FIXME: This is an ugly hack for this pass using forward iteration instead - // of reverse. If it worked like a normal combiner, the rsq would form before - // we saw a sqrt call. - auto *FDiv = - dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); - if (FDiv && FDiv->getOpcode() == Instruction::FDiv && - FDiv->getFPAccuracy() >= 1.0f && - canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && - // TODO: We should also handle the arcp case for the fdiv with non-1 value - isOneOrNegOne(FDiv->getOperand(0))) - return false; - Value *SrcVal = Sqrt.getOperand(0); bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); @@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); NewSqrt->takeName(&Sqrt); Sqrt.replaceAllUsesWith(NewSqrt); - Sqrt.eraseFromParent(); + DeadVals.push_back(&Sqrt); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 73b2660..5407566 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -468,6 +468,38 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { + Register Lo, Hi; + switch (MI.getOpcode()) { + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: { + // For signed operations, use sign extension + auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { + // For unsigned operations, use zero extension + auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + default: + llvm_unreachable("Unpack min/max lowering not implemented"); + } + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -654,6 +686,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } case UnpackBitShift: return lowerUnpackBitShift(MI); + case UnpackMinMax: + return lowerUnpackMinMax(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 7affe5a..d937815 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -123,6 +123,7 @@ private: void lowerSplitTo32(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); + void lowerUnpackMinMax(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f413bbc..bfe2c80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -522,6 +522,22 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE}); + addRulesForGOpcs({G_SMIN, G_SMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + addRulesForGOpcs({G_UMIN, G_UMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT // and G_FREEZE here, rest is trivially regbankselected earlier addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); @@ -617,6 +633,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); + addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Imm}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Imm}}); + bool hasSMRDx3 = ST->hasScalarDwordx3Loads(); bool hasSMRDSmall = ST->hasScalarSubwordLoads(); bool usesTrue16 = ST->useRealTrue16Insts(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index d0c6910..93e0efd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -212,6 +212,7 @@ enum LoweringMethodID { VccExtToSel, UniExtToSel, UnpackBitShift, + UnpackMinMax, S_BFE, V_BFE, VgprToVccCopy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 557d87f..56807a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || + Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: @@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { + Register DstReg = MI.getOperand(0).getReg(); + unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + // vdst, srcA, srcB, srcC, idx - OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI) + : getVGPROpMapping(DstReg, MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[4] = + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index ef63acc..71494be 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -905,7 +905,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << ":\n"; SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB); - SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); GCNRPTracker::LiveRegSet LiveIn, LiveOut; GCNRegPressure RPAtMBBEnd; @@ -931,7 +931,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { } } else { GCNUpwardRPTracker RPT(LIS); - RPT.reset(MRI, MBBEndSlot); + RPT.reset(MRI, MBBLastSlot); LiveOut = RPT.getLiveRegs(); RPAtMBBEnd = RPT.getPressure(); @@ -966,14 +966,14 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << PFX " Live-out:" << llvm::print(LiveOut, MRI); if (UseDownwardTracker) - ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI)); + ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBLastSlot, LIS, MRI)); GCNRPTracker::LiveRegSet LiveThrough; for (auto [Reg, Mask] : LiveIn) { LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg); if (MaskIntersection.any()) { LaneBitmask LTMask = getRegLiveThroughMask( - MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection); + MRI, LIS, Reg, MBBStartSlot, MBBLastSlot, MaskIntersection); if (LTMask.any()) LiveThrough[Reg] = LTMask; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a9c58bb..898d1ff 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -313,8 +313,8 @@ public: /// reset tracker to the end of the \p MBB. void reset(const MachineBasicBlock &MBB) { - reset(MBB.getParent()->getRegInfo(), - LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); + reset(MBB.getParent()->getRegInfo(), MBBLastSlot); } /// reset tracker to the point just after \p MI (in program order). diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 730be69..80e985d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -103,52 +103,52 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); + addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); + addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); + addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); + addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); + addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); + addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index b7dbb59..2c1a13c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1202,6 +1202,12 @@ public: unsigned getMinNumAGPRs() const { return MinNumAGPRs; } + /// Return true if an MFMA that requires at least \p NumRegs should select to + /// the AGPR form, instead of the VGPR form. + bool selectAGPRFormMFMA(unsigned NumRegs) const { + return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs; + } + // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. bool mayUseAGPRs(const Function &F) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 4d3331a..c684f9e 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -674,15 +674,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true); MachineOperand HiDstOp = Op0HOp1H->getOperand(0); - if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { - Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept); - Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept); - } - if (I.getFlag(MachineInstr::MIFlag::FmContract)) { - Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract); - Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract); - } - + uint32_t IFlags = I.getFlags(); + Op0LOp1L->setFlags(IFlags); + Op0HOp1H->setFlags(IFlags); LoDstOp.setIsRenamable(DstOp.isRenamable()); HiDstOp.setIsRenamable(DstOp.isRenamable()); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7cfd059..6500fce 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa class CanUseAGPR_MAI<ValueType vt> { code PredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= - }] # !srl(vt.Size, 5) # ");"; + MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( + }] # !srl(vt.Size, 5) # ");"; code GISelPredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( }] # !srl(vt.Size, 5) # ");"; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2a40fb9..83c7def 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -42,7 +42,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index a0acfcf..85ce944 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>; def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>; -def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>; def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>; def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>; def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>; @@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>; def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>; def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>; def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index ba70c9e..97379d7 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3677,7 +3677,7 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Out, STI)) return true; - if (IsLikely) { + if (IsLikely && MemOffsetOp.isExpr()) { TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI); TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index eff80e5..21d8ded 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -855,6 +855,16 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def ConstantImmAsmOperandClass : AsmOperandClass { + let Name = "ConstantImm"; + let PredicateMethod = "isConstantImm"; + let RenderMethod = "addImmOperands"; +} + +def ConstantImm64: Operand<i64> { + let ParserMatchClass = ConstantImmAsmOperandClass; +} + def simm19_lsl2 : Operand<i32> { let EncoderMethod = "getSimm19Lsl2Encoding"; let DecoderMethod = "DecodeSimm19Lsl2"; @@ -2947,10 +2957,10 @@ def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs, let hasDelaySlot = 1, isCTI = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "bne\t$rt, $imm64, $offset">; def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "beq\t$rt, $imm64, $offset">; class CondBranchPseudo<string instr_asm> : @@ -2978,7 +2988,7 @@ def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; let isCTI = 1 in class CondBranchImmPseudo<string instr_asm> : - MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, ConstantImm64:$imm, brtarget:$offset), !strconcat(instr_asm, "\t$rs, $imm, $offset")>; def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 40c05e8..5ceb477 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1520,6 +1520,8 @@ def HasVendorXqcics : Predicate<"Subtarget->hasVendorXqcics()">, AssemblerPredicate<(all_of FeatureVendorXqcics), "'Xqcics' (Qualcomm uC Conditional Select Extension)">; +def NoVendorXqcics + : Predicate<"!Subtarget->hasVendorXqcics()">; def FeatureVendorXqcicsr : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; @@ -1823,6 +1825,11 @@ def TuneConditionalCompressedMoveFusion def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">; +def TuneHasSingleElementVecFP64 + : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true", + "Certain vector FP64 operations produce a single result " + "element per cycle">; + def TuneMIPSP8700 : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", "MIPS p8700 processor">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index f2724c41..5e1d07a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1571,35 +1571,42 @@ def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>; } let Predicates = [HasVendorXqcicli, IsRV32] in { -def : QCILICCPat<SETEQ, QC_LIEQ>; -def : QCILICCPat<SETNE, QC_LINE>; def : QCILICCPat<SETLT, QC_LILT>; def : QCILICCPat<SETGE, QC_LIGE>; def : QCILICCPat<SETULT, QC_LILTU>; def : QCILICCPat<SETUGE, QC_LIGEU>; -def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; -def : QCILICCIPat<SETNE, QC_LINEI, simm5>; def : QCILICCIPat<SETLT, QC_LILTI, simm5>; def : QCILICCIPat<SETGE, QC_LIGEI, simm5>; def : QCILICCIPat<SETULT, QC_LILTUI, uimm5>; def : QCILICCIPat<SETUGE, QC_LIGEUI, uimm5>; -def : QCILICCPatInv<SETNE, QC_LIEQ>; -def : QCILICCPatInv<SETEQ, QC_LINE>; def : QCILICCPatInv<SETGE, QC_LILT>; def : QCILICCPatInv<SETLT, QC_LIGE>; def : QCILICCPatInv<SETUGE, QC_LILTU>; def : QCILICCPatInv<SETULT, QC_LIGEU>; -def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; -def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; def : QCILICCIPatInv<SETGE, QC_LILTI, simm5>; def : QCILICCIPatInv<SETLT, QC_LIGEI, simm5>; def : QCILICCIPatInv<SETUGE, QC_LILTUI, uimm5>; def : QCILICCIPatInv<SETULT, QC_LIGEUI, uimm5>; } // Predicates = [HasVendorXqcicli, IsRV32] +// Prioritize Xqcics over these patterns. +let Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] in { +def : QCILICCPat<SETEQ, QC_LIEQ>; +def : QCILICCPat<SETNE, QC_LINE>; + +def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; +def : QCILICCIPat<SETNE, QC_LINEI, simm5>; + +def : QCILICCPatInv<SETNE, QC_LIEQ>; +def : QCILICCPatInv<SETEQ, QC_LINE>; + +def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; +def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; +} // Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] + let Predicates = [HasVendorXqcics, IsRV32] in { // (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`. // These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern. diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 6d86aff..3658817 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -14,6 +14,10 @@ // otherwise. def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>; +// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64 +// is enabled. +def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>; + // Returns true if this is the sext.w pattern, addiw rd, rs1, 0. def isSEXT_W : TIIPredicate<"isSEXT_W", diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 17a7948..e86431f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390", FeatureStdExtZvl1024b, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone], - SiFiveIntelligenceTuneFeatures>; + !listconcat(SiFiveIntelligenceTuneFeatures, + [TuneHasSingleElementVecFP64])>; defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3e07eff..f863392a 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN, ProcResourceKind VL, ProcResourceKind VS, ProcResourceKind VCQ, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled = false, bit hasFastGather = false> { // Branching @@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN, // 13. Vector Floating-Point Instructions foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, isF=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8); - defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; - let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - } - defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4); - let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>; - // min max require merge - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + if !eq(sew, 64) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF", + "WriteVFMulAddV", "WriteVFMulAddF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } + let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + // min max require merge + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN, // Widening foreach mx = SchedMxListW in { foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in - defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = SiFive7GetCyclesDefault<mx>.c; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; @@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; } - defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } defvar Cycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c; @@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN, } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesNarrowing<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance { /// eventually be supplied by different SchedMachineModels. multiclass SiFive7SchedResources<int vlen, bit extraVALU, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled, bit hasFastGather> { defm SiFive7 : SiFive7ProcResources<extraVALU>; @@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU, : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB, SiFive7IDiv, SiFive7FDiv, SiFive7VA1, SiFive7VA1OrVA2, SiFive7VL, SiFive7VS, - SiFive7VCQ, fpLatencies, isFP64Throttled, - hasFastGather>; + SiFive7VCQ, fpLatencies, hasFastGather>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel { bit HasExtraVALU = false; SiFive7FPLatencies FPLatencies; - bit IsFP64Throttled = false; bit HasFastGather = false; string Name = !subst("Model", "", !subst("SiFive7", "", NAME)); @@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> { def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { let HasExtraVALU = true; let FPLatencies = SiFive7LowFPLatencies; - let IsFP64Throttled = true; let HasFastGather = true; } @@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU, model.FPLatencies, - model.IsFP64Throttled, model.HasFastGather>; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 01a4308..d11b446 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, IsWorstCase>; } +multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, int sew, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. Each multiclass is responsible for defining diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index e8c849e..28a1690 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -46,7 +46,6 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 20f03b0..60d39c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index 278ad7c..e621bcd44 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -14,7 +14,6 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 1811492..5b149f8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9580ade..1cfcb1f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 3bc46af..6dd43b2 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -547,7 +547,7 @@ unsigned X86TargetLowering::getAddressSpace() const { static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); + TargetTriple.isAndroid(); } static Constant* SegmentOffset(IRBuilderBase &IRB, |