Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp           | 12
 llvm/lib/Target/AArch64/AArch64InstrFormats.td            | 16
 llvm/lib/Target/AArch64/AArch64InstrInfo.td               |  2
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td            |  7
 llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp  | 17
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp                 |  4
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                    | 38
 llvm/lib/Target/AMDGPU/SIInstrInfo.h                      |  2
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp                 | 21
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp               | 71
 llvm/lib/Target/PowerPC/PPCISelLowering.h                 |  3
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp        | 43
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h          |  3
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp        |  3
 llvm/lib/Target/SystemZ/SystemZLongBranch.cpp             |  1
 llvm/lib/Target/X86/X86ISelLowering.cpp                   | 21
16 files changed, 227 insertions, 37 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40e6400..c8a038f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1916,6 +1916,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
       setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
     }
+
+    // Handle floating-point partial reduction
+    if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
+                                MVT::nxv8f16, Legal);
+    }
   }
 
   // Handle non-aliasing elements mask
@@ -2283,6 +2289,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
                               MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
   }
 
+  if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
+    setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT,
+                              MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
+  }
+
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ABDS, VT, Default);
   setOperationAction(ISD::ABDU, VT, Default);
@@ -7875,6 +7886,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::PARTIAL_REDUCE_SMLA:
   case ISD::PARTIAL_REDUCE_UMLA:
   case ISD::PARTIAL_REDUCE_SUMLA:
+  case ISD::PARTIAL_REDUCE_FMLA:
    return LowerPARTIAL_REDUCE_MLA(Op, DAG);
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 58a53af..bb2f083 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
   def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
 }
 
-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
   : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
-                                V128, asm, ".16b", []> {
+                                V128, asm, ".16b", pattern> {
   let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
                              "|", kind, "\t$Rd, $Rn, $Rm}");
 }
 
-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
-  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
+  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
+                 [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
+                                                  (v16i8 V128:$Rn),
+                                                  (v16i8 V128:$Rm)))]> {
     let Predicates = [HasNEON, HasF8F16MM];
   }
-  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
+  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
+                 [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+                                                  (v16i8 V128:$Rn),
+                                                  (v16i8 V128:$Rm)))]> {
     let Predicates = [HasNEON, HasF8F32MM];
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e6954f7..76f076a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
   defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
 
 let Uses = [FPMR, FPCR] in
-  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
 
 //===----------------------------------------------------------------------===//
 // Contention Management Hints (FEAT_CMH)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dc..e1f4386 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
                                node:$Zm)
 ]>;
 
+def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                           [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm),
+                            (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm)
+]>;
+
 def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>,
   SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3>
@@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 
 let Predicates = [HasSVE2p1_or_SME2] in {
 defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>;
-defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
+defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>;
 defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
 
 defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 65e6ed9..c52eb4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
                                      const UniformityInfo &UI,
                                      ValueMap<const Value *, bool> &Tracker) {
   llvm::Intrinsic::ID IID = II.getIntrinsicID();
-
+  /// We deliberately do not simplify readfirstlane with a uniform argument, so
+  /// that frontends can use it to force a copy to SGPR and thereby prevent the
+  /// backend from generating unwanted waterfall loops.
   switch (IID) {
   case Intrinsic::amdgcn_permlane64:
-  case Intrinsic::amdgcn_readfirstlane:
   case Intrinsic::amdgcn_readlane: {
     Value *Src = II.getArgOperand(0);
     if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
     return Changed;
   }
   default:
-    llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+    return false;
   }
   return false;
 }
@@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
     auto *II = dyn_cast<IntrinsicInst>(&I);
     if (!II)
       continue;
-
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::amdgcn_permlane64:
-    case Intrinsic::amdgcn_readfirstlane:
-    case Intrinsic::amdgcn_readlane:
-    case Intrinsic::amdgcn_ballot:
-      break;
-    default:
-      continue;
-    }
     IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
   }
   return IsChanged;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84984a0..964309b 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
       return false;
     MI->setDesc(TII->get(NewMFMAOpc));
     MI->untieRegOperand(0);
+    const MCInstrDesc &MCID = MI->getDesc();
+    for (unsigned I = 0; I < MI->getNumDefs(); ++I)
+      if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1)
+        MI->getOperand(I).setIsEarlyClobber(true);
   }
 
   // TODO: Should we try to avoid adding this to the candidate list?
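The comment added to AMDGPUUniformIntrinsicCombine.cpp above describes a frontend-side idiom rather than anything in this patch: keeping llvm.amdgcn.readfirstlane intact even when its argument is uniform lets a frontend pin a value to an SGPR. A minimal HIP sketch of that idiom (hypothetical kernel and names, shown only for illustration):

#include <hip/hip_runtime.h>

// Device code: assert to the backend that `base` is wave-uniform by reading it
// from the first active lane. Because the combine above no longer folds
// readfirstlane away for uniform inputs, the value stays in an SGPR, which is
// the property the new comment says frontends rely on to avoid waterfall loops.
__global__ void scale(const float *__restrict__ in, float *__restrict__ out,
                      int base) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int uniform_base = __builtin_amdgcn_readfirstlane(base);
  out[tid] = in[uniform_base + tid] * 2.0f;
}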
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea..9c74c65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10163,7 +10163,7 @@ static bool followSubRegDef(MachineInstr &MI,
 }
 
 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                                     MachineRegisterInfo &MRI) {
+                                     const MachineRegisterInfo &MRI) {
   assert(MRI.isSSA());
   if (!P.Reg.isVirtual())
     return nullptr;
@@ -10628,6 +10628,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
 static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
                         const SIRegisterInfo &RI) {
   MachineInstr *KillsSCC = nullptr;
+  if (SCCValid->getParent() != SCCRedefine->getParent())
+    return false;
   for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
                                      SCCRedefine->getIterator())) {
     if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10672,8 +10674,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (CmpValue != 0)
      return false;
 
-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
       return false;
 
     // For S_OP that set SCC = DST!=0, do the transformation
@@ -10692,6 +10694,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!optimizeSCC(Def, &CmpInstr, RI))
       return false;
 
+    // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+    // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
+    // 64-bit foldableSelect then delete s_or_b32 in the sequence:
+    //   sX = s_cselect_b64 (non-zero imm), 0
+    //   sLo = copy sX.sub0
+    //   sHi = copy sX.sub1
+    //   sY = s_or_b32 sLo, sHi
+    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+      const MachineOperand &OrOpnd1 = Def->getOperand(1);
+      const MachineOperand &OrOpnd2 = Def->getOperand(2);
+      if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+        MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+        MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+        if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+            Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+            Def2->getOperand(1).isReg() &&
+            Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+            Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+            Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+          MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+          if (Select && foldableSelect(*Select))
+            optimizeSCC(Select, Def, RI);
+        }
+      }
+    }
     return true;
   };
 
@@ -10721,8 +10749,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
     // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
 
-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
       return false;
 
     if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b53..8d693b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
 /// skipping copy like instructions and subreg-manipulation pseudos.
 /// Following another subreg of a reg:subreg isn't supported.
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                               MachineRegisterInfo &MRI);
+                               const MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bfac639..caff354 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
   const MCInstrDesc &Desc = TII->get(MI.getOpcode());
   unsigned ConstantBusCount = 0;
   for (MachineOperand &Op : MI.explicit_uses()) {
-    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
-      continue;
-
-    unsigned I = Op.getOperandNo();
+    if (Op.isReg()) {
+      if (TRI->isVGPR(*MRI, Op.getReg()))
+        continue;
 
-    int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]);
-    if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass)))
+      if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
+        ++ConstantBusCount;
+        continue;
+      }
+    } else if (!Op.isImm())
       continue;
 
-    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
-        TRI->isSGPRReg(*MRI, Op.getReg())) {
-      ++ConstantBusCount;
+    unsigned I = Op.getOperandNo();
+    const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I, TRI);
+    if (!OpRC || !TRI->isVSSuperClass(OpRC))
       continue;
-    }
 
     Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 20fc849..dd233e2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -657,6 +657,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
   setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
 
+  if (Subtarget.isISA3_0() && isPPC64) {
+    setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom);
+  }
+
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
@@ -11917,6 +11928,62 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
   return getDataClassTest(LHS, Category, Dl, DAG, Subtarget);
 }
 
+// Adjust the length value for a load/store with length to account for the
+// instructions requiring a left justified length, and for non-byte element
+// types requiring scaling by element size.
+static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
+                            SelectionDAG &DAG) {
+  SDLoc dl(Val);
+  EVT VT = Val->getValueType(0);
+  unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
+  unsigned TypeAdj = llvm::countr_zero<uint32_t>(Bits / 8);
+  SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT);
+  return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt);
+}
+
+SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
+  auto VPLD = cast<VPLoadSDNode>(Op);
+  bool Future = Subtarget.isISAFuture();
+  SDLoc dl(Op);
+  assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
+         "Mask predication not supported");
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4));
+  unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
+  unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits();
+  Len = AdjustLength(Len, EltBits, !Future, DAG);
+  SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+                   VPLD->getOperand(1), Len};
+  SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other);
+  SDValue VPL =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops,
+                              VPLD->getMemoryVT(), VPLD->getMemOperand());
+  return VPL;
+}
+
+SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
+  auto VPST = cast<VPStoreSDNode>(Op);
+  assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
+         "Mask predication not supported");
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc dl(Op);
+  SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5));
+  unsigned EltBits =
+      Op->getOperand(1).getValueType().getScalarType().getSizeInBits();
+  bool Future = Subtarget.isISAFuture();
+  unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
+  Len = AdjustLength(Len, EltBits, !Future, DAG);
+  SDValue Ops[] = {
+      VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+      DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)),
+      VPST->getOperand(2), Len};
+  SDVTList Tys = DAG.getVTList(MVT::Other);
+  SDValue VPS =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops,
+                              VPST->getMemoryVT(), VPST->getMemOperand());
+  return VPS;
+}
+
 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -12771,6 +12838,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     if (Op->getFlags().hasNoFPExcept())
       return Op;
     return SDValue();
+  case ISD::VP_LOAD:
+    return LowerVP_LOAD(Op, DAG);
+  case ISD::VP_STORE:
+    return LowerVP_STORE(Op, DAG);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 880aca7..d967018 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1345,6 +1345,9 @@ namespace llvm {
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b04e887..e74f1bd 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -24,6 +24,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
 
+static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl",
+                             cl::desc("Allow vp.load and vp.store for pwr9"),
+                             cl::init(false), cl::Hidden);
+
 static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
                                  cl::desc("add masking cost for i1 vectors"),
                                  cl::init(true), cl::Hidden);
@@ -1031,3 +1035,42 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
   return TLI->supportsTailCallFor(CB);
 }
+
+// Target hook used by CodeGen to decide whether to expand vector predication
+// intrinsics into scalar operations or to use special ISD nodes to represent
+// them. The Target will not see the intrinsics.
+TargetTransformInfo::VPLegalization
+PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+  using VPLegalization = TargetTransformInfo::VPLegalization;
+  unsigned Directive = ST->getCPUDirective();
+  VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI);
+  if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE &&
+      (!Pwr9EVL || Directive != PPC::DIR_PWR9))
+    return DefaultLegalization;
+
+  if (!ST->isPPC64())
+    return DefaultLegalization;
+
+  unsigned IID = PI.getIntrinsicID();
+  if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store)
+    return DefaultLegalization;
+
+  bool IsLoad = IID == Intrinsic::vp_load;
+  Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType();
+  EVT VT = TLI->getValueType(DL, VecTy, true);
+  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+      VT != MVT::v16i8)
+    return DefaultLegalization;
+
+  auto IsAllTrueMask = [](Value *MaskVal) {
+    if (Value *SplattedVal = getSplatValue(MaskVal))
+      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+        return ConstValue->isAllOnesValue();
+    return false;
+  };
+  unsigned MaskIx = IsLoad ? 1 : 2;
+  if (!IsAllTrueMask(PI.getOperand(MaskIx)))
+    return DefaultLegalization;
+
+  return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8d7f255..f80ebdb 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -150,6 +150,9 @@ public:
                          ArrayRef<Type *> Types) const override;
   bool supportsTailCallFor(const CallBase *CB) const override;
 
+  TargetTransformInfo::VPLegalization
+  getVPLegalizationStrategy(const VPIntrinsic &PI) const override;
+
 private:
   // The following constant is used for estimating costs on power9.
   static const InstructionCost::CostType P9PipelineFlushEstimate = 80;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 7bc0b5b..332433b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2140,7 +2140,8 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   // Assume memory ops cost scale with the number of vector registers
   // possible accessed by the instruction. Note that BasicTTI already
   // handles the LT.first term for us.
-  if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
+  if (ST->hasVInstructions() && LT.second.isVector() &&
+      CostKind != TTI::TCK_CodeSize)
     BaseCost *= TLI->getLMULCost(LT.second);
   return Cost + BaseCost;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 21a233b2..b7a93e7 100644
--- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -216,6 +216,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI,
          MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() ||
          MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER ||
          MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() ||
+         MI.getOpcode() == TargetOpcode::RELOC_NONE ||
          // These have a size that may be zero:
          MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP ||
          MI.getOpcode() == SystemZ::PATCHPOINT ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 090f649..05a854a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45022,11 +45022,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::INSERTPS:
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
+  case X86ISD::VZEXT_MOVL:
   case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::SHUFP:
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPV:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMI:
   case X86ISD::VPERMV:
   case X86ISD::VPERMV3: {
     SmallVector<int, 8> Mask;
@@ -45052,6 +45057,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
     }
     break;
   }
+  case X86ISD::VBROADCAST: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    if (SrcVT.isVector()) {
+      APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+      return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly,
+                                                  Depth + 1);
+    }
+    return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1);
+  }
   }
   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
       Op, DemandedElts, DAG, PoisonOnly, Depth);
@@ -45096,13 +45111,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   // SSE target shuffles.
   case X86ISD::INSERTPS:
   case X86ISD::PSHUFB:
+  case X86ISD::VZEXT_MOVL:
   case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::SHUFP:
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPV:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMI:
   case X86ISD::VPERMV:
   case X86ISD::VPERMV3:
+  case X86ISD::VBROADCAST:
     return false;
   // SSE comparisons handle all icmp/fcmp cases.
   // TODO: Add CMPM/MM with test coverage.
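As a cross-check of the arithmetic in AdjustLength from the PPCISelLowering.cpp hunk above, the following standalone C++ sketch (not part of the patch; names are illustrative) reproduces the same shift computation on plain integers:

#include <cassert>
#include <cstdint>

// Mirrors AdjustLength: the explicit vector length is scaled from elements to
// bytes (a shift by log2 of the element size in bytes) and, for the lxvl/stxvl
// forms that expect a left-justified length, moved into the high-order byte of
// the 64-bit length register (an extra shift of 64 - 8 = 56). The lxvrl/stxvrl
// path in the patch passes Left = false and skips that justification.
static uint64_t adjustLength(uint64_t EVL, unsigned EltBits, bool LeftJustify) {
  unsigned TypeAdj = __builtin_ctz(EltBits / 8); // log2(bytes per element)
  unsigned LeftAdj = LeftJustify ? 64 - 8 : 0;   // justify into bits 63..56
  return EVL << (LeftAdj + TypeAdj);
}

int main() {
  // A vp.load of <8 x i16> with an explicit vector length of 5 elements covers
  // 10 bytes; the left-justified form carries that count in the top byte.
  assert(adjustLength(5, /*EltBits=*/16, /*LeftJustify=*/true) == (10ULL << 56));
  assert(adjustLength(5, /*EltBits=*/16, /*LeftJustify=*/false) == 10);
  return 0;
}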
