Diffstat (limited to 'llvm/lib/Target')
26 files changed, 533 insertions, 231 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 84f6d42..8617377 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { return ZExtBool; } -// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the -// input operands are copy nodes where the source register is in a -// StridedOrContiguous class. For example: -// -// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. -// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous -// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous -// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. -// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous -// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous -// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr -// -bool shouldUseFormStridedPseudo(MachineInstr &MI) { - MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - - assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) && - "Unexpected opcode."); - - MCRegister SubReg = MCRegister::NoRegister; - for (unsigned I = 1; I < MI.getNumOperands(); ++I) { - MachineOperand &MO = MI.getOperand(I); - assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE"); - - MachineOperand *Def = MRI.getOneDef(MO.getReg()); - if (!Def || !Def->getParent()->isCopy()) - return false; - - const MachineOperand &CopySrc = Def->getParent()->getOperand(1); - unsigned OpSubReg = CopySrc.getSubReg(); - if (SubReg == MCRegister::NoRegister) - SubReg = OpSubReg; - - MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); - const TargetRegisterClass *CopySrcClass = - MRI.getRegClass(CopySrcOp->getReg()); - if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || - (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && - CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)) - return false; - } - - return true; -} - void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { // Live-in physreg copies that are glued to SMSTART are applied as @@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } - if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { - // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies - // from a StridedOrContiguous class, fall back on REG_SEQUENCE node. - if (shouldUseFormStridedPseudo(MI)) - return; - - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(TargetOpcode::REG_SEQUENCE), - MI.getOperand(0).getReg()); - - for (unsigned I = 1; I < MI.getNumOperands(); ++I) { - MIB.add(MI.getOperand(I)); - MIB.addImm(AArch64::zsub0 + (I - 1)); - } - - MI.eraseFromParent(); - return; - } - // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that // have nothing to do with VG, were it not that they are used to materialise a // frame-address. 
If they contain a frame-index to a scalable vector, this diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3c57ba4..a0928b9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -428,7 +428,6 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; - def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -451,6 +450,7 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5, def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; +def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; @@ -817,11 +817,9 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; -def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>; - -def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; -def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; -def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64Rev>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64Rev>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64Rev>; def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; @@ -3000,8 +2998,8 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; -def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>; -def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>; +def : Pat<(AArch64rev16 GPR32:$Rn), (REV16Wr GPR32:$Rn)>; +def : Pat<(AArch64rev16 GPR64:$Rn), (REV16Xr GPR64:$Rn)>; def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index aae2fda..a6edcf1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -940,6 +940,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_cttz_elts: { + EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + if (!getTLI()->shouldExpandCttzElements(ArgVT)) { + // This will consist of a SVE brkb and a cntp instruction. These + // typically have the same latency and half the throughput as a vector + // add instruction. 
+ return 4; + } + break; + } default: break; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 0ac131e..4f6a413 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -36,27 +36,26 @@ let WantsRoot = true in def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">; // The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to -// improve register allocation for intrinsics which use strided and contiguous -// multi-vector registers, avoiding unnecessary copies. -// If the operands of the pseudo are copies where the source register is in -// the StridedOrContiguous class, the pseudo is used to provide a hint to the -// register allocator suggesting a contigious multi-vector register which -// matches the subregister sequence used by the operands. -// If the operands do not match this pattern, the pseudos are expanded -// to a REG_SEQUENCE using the post-isel hook. +// improve register allocation for intrinsics which use strided and +// contiguous multi-vector registers, avoiding unnecessary copies. +// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the +// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the +// source register is in the StridedOrContiguous class. The operands in the +// sequence must all have the same subreg index. +// The pseudo is then used to provide a hint to the register allocator +// suggesting a contigious multi-vector register which matches the +// subregister sequence used by the operands. def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO : Pseudo<(outs ZPR2:$tup), (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{ let hasSideEffects = 0; - let hasPostISelHook = 1; } def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : Pseudo<(outs ZPR4:$tup), (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{ let hasSideEffects = 0; - let hasPostISelHook = 1; } def SPILL_PPR_TO_ZPR_SLOT_PSEUDO : @@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm), - (!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2), + (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm)>; class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm), (!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), + (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), zpr_ty:$Zm)>; class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice> @@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic Operand imm_ty, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)), 
(!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>; + (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>; class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, Operand imm_ty, ComplexPattern tileslice> : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)), (!cast<Instruction>(name # _PSEUDO) $base, $offset, - (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), + (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), zpr_ty:$Zm, imm_ty:$i)>; class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty> diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index 4a0312d..2ffd4d7 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass { bool optimizeStartStopPairs(MachineBasicBlock &MBB, bool &HasRemovedAllSMChanges) const; + bool visitRegSequence(MachineInstr &MI); }; char SMEPeepholeOpt::ID = 0; @@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs( return Changed; } +// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation +// of multi-vector intrinsics. However, the psuedo should only be emitted if +// the input registers of the REG_SEQUENCE are copy nodes where the source +// register is in a StridedOrContiguous class. For example: +// +// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. +// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous +// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous +// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. +// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous +// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous +// %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1 +// +// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr +// +bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) { + assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) { + case AArch64::ZPR2RegClassID: + case AArch64::ZPR4RegClassID: + case AArch64::ZPR2Mul2RegClassID: + case AArch64::ZPR4Mul4RegClassID: + break; + default: + return false; + } + + // The first operand is the register class created by the REG_SEQUENCE. + // Each operand pair after this consists of a vreg + subreg index, so + // for example a sequence of 2 registers will have a total of 5 operands. 
+ if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9) + return false; + + MCRegister SubReg = MCRegister::NoRegister; + for (unsigned I = 1; I < MI.getNumOperands(); I += 2) { + MachineOperand &MO = MI.getOperand(I); + + MachineOperand *Def = MRI.getOneDef(MO.getReg()); + if (!Def || !Def->getParent()->isCopy()) + return false; + + const MachineOperand &CopySrc = Def->getParent()->getOperand(1); + unsigned OpSubReg = CopySrc.getSubReg(); + if (SubReg == MCRegister::NoRegister) + SubReg = OpSubReg; + + MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || + CopySrcOp->getReg().isPhysical()) + return false; + + const TargetRegisterClass *CopySrcClass = + MRI.getRegClass(CopySrcOp->getReg()); + if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && + CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass) + return false; + } + + unsigned Opc = MI.getNumOperands() == 5 + ? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO + : AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; + + const TargetInstrInfo *TII = + MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(Opc), MI.getOperand(0).getReg()); + for (unsigned I = 1; I < MI.getNumOperands(); I += 2) + MIB.addReg(MI.getOperand(I).getReg()); + + MI.eraseFromParent(); + return true; +} + INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt", "SME Peephole Optimization", false, false) @@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { bool BlockHasAllSMChangesRemoved; Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved); FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved; + + if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) { + for (MachineInstr &MI : make_early_inc_range(MBB)) + if (MI.getOpcode() == AArch64::REG_SEQUENCE) + Changed |= visitRegSequence(MI); + } } AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index cca9fa7..792e17e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine( // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { if (auto *K = isConstOrConstSplat(Src.getOperand(1))) { - if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { - SDValue BV = stripBitcast(Src.getOperand(0)); - if (BV.getOpcode() == ISD::BUILD_VECTOR && - BV.getValueType().getVectorNumElements() == 2) { - SDValue SrcElt = BV.getOperand(1); - EVT SrcEltVT = SrcElt.getValueType(); - if (SrcEltVT.isFloatingPoint()) { - SrcElt = DAG.getNode(ISD::BITCAST, SL, - SrcEltVT.changeTypeToInteger(), SrcElt); + SDValue BV = stripBitcast(Src.getOperand(0)); + if (BV.getOpcode() == ISD::BUILD_VECTOR) { + EVT SrcEltVT = BV.getOperand(0).getValueType(); + unsigned SrcEltSize = SrcEltVT.getSizeInBits(); + unsigned BitIndex = K->getZExtValue(); + unsigned PartIndex = BitIndex / SrcEltSize; + + if (PartIndex * SrcEltSize == BitIndex && + PartIndex < BV.getNumOperands()) { + if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) { + SDValue SrcElt = + DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(), + BV.getOperand(PartIndex)); + return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); } - - 
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5bfd891..09f7877 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { return 1024; } -// FIXME: Should we use narrower types for local/region, or account for when -// unaligned access is legal? Type *GCNTTIImpl::getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, @@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType( if (AtomicElementSize) return Type::getIntNTy(Context, *AtomicElementSize * 8); - Align MinAlign = std::min(SrcAlign, DestAlign); - - // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the - // hardware into byte accesses. If you assume all alignments are equally - // probable, it's more efficient on average to use short accesses for this - // case. - if (MinAlign == Align(2)) - return Type::getInt16Ty(Context); - - // Not all subtargets have 128-bit DS instructions, and we currently don't - // form them by default. - if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || - SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || - DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || - DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { - return FixedVectorType::get(Type::getInt32Ty(Context), 2); - } - - // Global memory works best with 16-byte accesses. + // 16-byte accesses achieve the highest copy throughput. // If the operation has a fixed known length that is large enough, it is // worthwhile to return an even wider type and let legalization lower it into - // multiple accesses, effectively unrolling the memcpy loop. Private memory - // also hits this, although accesses may be decomposed. + // multiple accesses, effectively unrolling the memcpy loop. + // We also rely on legalization to decompose into smaller accesses for + // subtargets and address spaces where it is necessary. 
// // Don't unroll if Length is not a constant, since unrolling leads to worse // performance for length values that are smaller or slightly larger than the @@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType( OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, DestAlign, AtomicCpySize); - Align MinAlign = std::min(SrcAlign, DestAlign); - - if (MinAlign != Align(2)) { - Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4); - while (RemainingBytes >= 16) { - OpsOut.push_back(I32x4Ty); - RemainingBytes -= 16; - } + Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4); + while (RemainingBytes >= 16) { + OpsOut.push_back(I32x4Ty); + RemainingBytes -= 16; + } - Type *I64Ty = Type::getInt64Ty(Context); - while (RemainingBytes >= 8) { - OpsOut.push_back(I64Ty); - RemainingBytes -= 8; - } + Type *I64Ty = Type::getInt64Ty(Context); + while (RemainingBytes >= 8) { + OpsOut.push_back(I64Ty); + RemainingBytes -= 8; + } - Type *I32Ty = Type::getInt32Ty(Context); - while (RemainingBytes >= 4) { - OpsOut.push_back(I32Ty); - RemainingBytes -= 4; - } + Type *I32Ty = Type::getInt32Ty(Context); + while (RemainingBytes >= 4) { + OpsOut.push_back(I32Ty); + RemainingBytes -= 4; } Type *I16Ty = Type::getInt16Ty(Context); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index a20319e..ac11526 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, std::vector<std::pair<unsigned, unsigned>> &RemapChan) { unsigned NeededUndefs = 4 - RSI.UndefReg.size(); - if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) - return false; std::vector<MachineInstr *> &MIs = PreviousRegSeqByUndefCount[NeededUndefs]; + if (MIs.empty()) + return false; CompatibleRSI = PreviousRegSeq[MIs.back()]; tryMergeVector(&CompatibleRSI, &RSI, RemapChan); return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index bee4c47..6e08aff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = NotHasTrue16BitInsts in { def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; -} // end OtherPredicates = [NotHasTrue16BitInsts] +} // end True16Predicate = NotHasTrue16BitInsts + +let True16Predicate = UseRealTrue16Insts in { + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; +} // end True16Predicate = UseRealTrue16BitInsts -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; -} // end OtherPredicates = [HasTrue16BitInsts] +} // end True16Predicate = UseFakeTrue16BitInsts def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F32_e64, 
CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; @@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse> def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; } +let True16Predicate = UseRealTrue16Insts in { +def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + let True16Predicate = UseFakeTrue16Insts in { def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; @@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum> def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>; } +let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in { +def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +} + let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in { def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index fad7e67..67bebfb3 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -976,8 +976,7 @@ struct Waitcnt { Waitcnt() = default; // Pre-gfx12 constructor. Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) - : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt), - SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {} + : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {} // gfx12+ constructor. Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 1e76bf7..296031e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -27,6 +27,28 @@ using namespace llvm; using namespace llvm::AMDGPU; +// Return the PAL metadata hardware shader stage name. +static const char *getStageName(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_PS: + return ".ps"; + case CallingConv::AMDGPU_VS: + return ".vs"; + case CallingConv::AMDGPU_GS: + return ".gs"; + case CallingConv::AMDGPU_ES: + return ".es"; + case CallingConv::AMDGPU_HS: + return ".hs"; + case CallingConv::AMDGPU_LS: + return ".ls"; + case CallingConv::AMDGPU_Gfx: + llvm_unreachable("Callable shader has no hardware stage"); + default: + return ".cs"; + } +} + // Read the PAL metadata from IR metadata, where it was put by the frontend. 
void AMDGPUPALMetadata::readFromIR(Module &M) { auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack"); @@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) { if (isLegacy()) return; // Msgpack format. + // Entry point is updated to .entry_point_symbol and is set to the function + // name getHwStage(CC)[".entry_point_symbol"] = MsgPackDoc.getNode(Name, /*Copy=*/true); + + // Set .entry_point which is defined + // to be _amdgpu_<stage> and _amdgpu_cs for non-shader functions + SmallString<16> EPName("_amdgpu_"); + raw_svector_ostream EPNameOS(EPName); + EPNameOS << getStageName(CC) + 1; + getHwStage(CC)[".entry_point"] = + MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true); } // Set the number of used vgprs in the metadata. This is an optional @@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() { return GraphicsRegisters.getMap(); } -// Return the PAL metadata hardware shader stage name. -static const char *getStageName(CallingConv::ID CC) { - switch (CC) { - case CallingConv::AMDGPU_PS: - return ".ps"; - case CallingConv::AMDGPU_VS: - return ".vs"; - case CallingConv::AMDGPU_GS: - return ".gs"; - case CallingConv::AMDGPU_ES: - return ".es"; - case CallingConv::AMDGPU_HS: - return ".hs"; - case CallingConv::AMDGPU_LS: - return ".ls"; - case CallingConv::AMDGPU_Gfx: - llvm_unreachable("Callable shader has no hardware stage"); - default: - return ".cs"; - } -} - msgpack::DocNode &AMDGPUPALMetadata::refHwStage() { auto &N = MsgPackDoc.getRoot() diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 633a99d..74def43 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; +def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def True : Predicate<"true">; def False : Predicate<"false">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 56d8b73..a0d00e4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT : Requires<[hasSM<90>, hasPTX<78>]>; def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>; + +// Tcgen05 intrinsics +let isConvergent = true in { + +multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins rc:$dst, Int32Regs:$ncols), + !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"), + [(Intr rc:$dst, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} + +defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>; +defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>; + +defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; + +defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", 
int_nvvm_tcgen05_alloc_shared_cg2>; + +multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int32Regs:$ncols), + !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"), + [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>; +defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; + +multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), (ins), + !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"), + [(Intr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; +defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; + +} // isConvergent diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 919f487..0c4420b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -93,6 +93,21 @@ public: bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + // Tcgen05 instructions in Blackwell family + bool hasTcgen05Instructions() const { + bool HasTcgen05 = false; + switch (FullSmVersion) { + default: + break; + case 1001: // sm_100a + case 1011: // sm_101a + HasTcgen05 = true; + break; + } + + return HasTcgen05 && PTXVersion >= 86; + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e88027f..f2afa6f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { else if (UseShortPointers) Ret += "-p3:32:32-p4:32:32-p5:32:32"; + // Tensor Memory (addrspace:6) is always 32-bits. + Ret += "-p6:32:32"; + Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; return Ret; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 36b8a24..274c7cb 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -65,9 +65,10 @@ public: /// for a given imm form load/store opcode \p ImmFormOpcode. /// FIXME: move this to PPCInstrInfo class. unsigned getMappedIdxOpcForImmOpc(unsigned ImmOpcode) const { - if (!ImmToIdxMap.count(ImmOpcode)) + auto It = ImmToIdxMap.find(ImmOpcode); + if (It == ImmToIdxMap.end()) return PPC::INSTRUCTION_LIST_END; - return ImmToIdxMap.find(ImmOpcode)->second; + return It->second; } /// getPointerRegClass - Return the register class to use to hold pointers. 
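The PPCRegisterInfo.h hunk above swaps a count()-then-find() pair for a single find(). A minimal standalone sketch of that lookup pattern, assuming std::map semantics for ImmToIdxMap; the opcode values and the kInstructionListEnd constant are illustrative stand-ins, not the real PPC tables:

// Sketch of the single-lookup pattern from getMappedIdxOpcForImmOpc above.
// std::map and the numeric opcodes are placeholders for illustration only.
#include <cassert>
#include <map>

static const unsigned kInstructionListEnd = ~0u;

static unsigned getMappedIdxOpc(const std::map<unsigned, unsigned> &ImmToIdxMap,
                                unsigned ImmOpcode) {
  // One lookup instead of count() + find(): the iterator that proves the key
  // is present already carries the mapped opcode.
  auto It = ImmToIdxMap.find(ImmOpcode);
  if (It == ImmToIdxMap.end())
    return kInstructionListEnd;
  return It->second;
}

int main() {
  std::map<unsigned, unsigned> ImmToIdxMap = {{10, 11}, {20, 21}};
  assert(getMappedIdxOpc(ImmToIdxMap, 10) == 11);
  assert(getMappedIdxOpc(ImmToIdxMap, 99) == kInstructionListEnd);
}

The hit path thus queries the map once rather than twice, which is the whole point of the change.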
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index ad06f47..98e05b7 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -42,6 +42,8 @@ def CSR_ILP32D_LP64D_V // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; +def CSR_IPRA : CalleeSavedRegs<(add X1)>; + // Interrupt handler needs to save/restore all registers that are used, // both Caller and Callee saved registers. def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 5, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8e3caf5..7c3b583 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -17759,6 +17759,83 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG, return DAG.getZExtOrTrunc(Pop, DL, VT); } +static SDValue performSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + // (shl (zext x), y) -> (vwsll x, y) + if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + return V; + + // (shl (sext x), C) -> (vwmulsu x, 1u << C) + // (shl (zext x), C) -> (vwmulu x, 1u << C) + + if (!DCI.isAfterLegalizeDAG()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + if (!LHS.hasOneUse()) + return SDValue(); + unsigned Opcode; + switch (LHS.getOpcode()) { + case ISD::SIGN_EXTEND: + case RISCVISD::VSEXT_VL: + Opcode = RISCVISD::VWMULSU_VL; + break; + case ISD::ZERO_EXTEND: + case RISCVISD::VZEXT_VL: + Opcode = RISCVISD::VWMULU_VL; + break; + default: + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + APInt ShAmt; + uint64_t ShAmtInt; + if (ISD::isConstantSplatVector(RHS.getNode(), ShAmt)) + ShAmtInt = ShAmt.getZExtValue(); + else if (RHS.getOpcode() == RISCVISD::VMV_V_X_VL && + RHS.getOperand(1).getOpcode() == ISD::Constant) + ShAmtInt = RHS.getConstantOperandVal(1); + else + return SDValue(); + + // Better foldings: + // (shl (sext x), 1) -> (vwadd x, x) + // (shl (zext x), 1) -> (vwaddu x, x) + if (ShAmtInt <= 1) + return SDValue(); + + SDValue NarrowOp = LHS.getOperand(0); + MVT NarrowVT = NarrowOp.getSimpleValueType(); + uint64_t NarrowBits = NarrowVT.getScalarSizeInBits(); + if (ShAmtInt >= NarrowBits) + return SDValue(); + MVT VT = N->getSimpleValueType(0); + if (NarrowBits * 2 != VT.getScalarSizeInBits()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue Passthru, Mask, VL; + switch (N->getOpcode()) { + case ISD::SHL: + Passthru = DAG.getUNDEF(VT); + std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); + break; + case RISCVISD::SHL_VL: + Passthru = N->getOperand(2); + Mask = N->getOperand(3); + VL = N->getOperand(4); + break; + default: + llvm_unreachable("Expected SHL"); + } + return DAG.getNode(Opcode, DL, VT, NarrowOp, + DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT), + Passthru, Mask, VL); +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -18392,7 +18469,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::SHL_VL: - if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + if (SDValue V = performSHLCombine(N, DCI, Subtarget)) return V; [[fallthrough]]; case RISCVISD::SRA_VL: @@ -18417,7 +18494,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SRL: case ISD::SHL: { if 
(N->getOpcode() == ISD::SHL) { - if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) + if (SDValue V = performSHLCombine(N, DCI, Subtarget)) return V; } SDValue ShAmt = N->getOperand(1); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index b0a5269..7a99bfd 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -56,6 +56,11 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode) /*PC*/0, HwMode) {} const MCPhysReg * +RISCVRegisterInfo::getIPRACSRegs(const MachineFunction *MF) const { + return CSR_IPRA_SaveList; +} + +const MCPhysReg * RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 3ab79694..6c4e9c7 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -62,6 +62,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getIPRACSRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fa7c7c5..cb2ec1d 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -940,6 +940,44 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( return NumLoads * MemOpCost; } +InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( + unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, + TTI::TargetCostKind CostKind, const Instruction *I) { + bool IsLegal = (Opcode == Instruction::Store && + isLegalMaskedCompressStore(DataTy, Alignment)) || + (Opcode == Instruction::Load && + isLegalMaskedExpandLoad(DataTy, Alignment)); + if (!IsLegal || CostKind != TTI::TCK_RecipThroughput) + return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask, + Alignment, CostKind, I); + // Example compressstore sequence: + // vsetivli zero, 8, e32, m2, ta, ma (ignored) + // vcompress.vm v10, v8, v0 + // vcpop.m a1, v0 + // vsetvli zero, a1, e32, m2, ta, ma + // vse32.v v10, (a0) + // Example expandload sequence: + // vsetivli zero, 8, e8, mf2, ta, ma (ignored) + // vcpop.m a1, v0 + // vsetvli zero, a1, e32, m2, ta, ma + // vle32.v v10, (a0) + // vsetivli zero, 8, e32, m2, ta, ma + // viota.m v12, v0 + // vrgather.vv v8, v10, v12, v0.t + auto MemOpCost = + getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind); + auto LT = getTypeLegalizationCost(DataTy); + SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI}; + if (VariableMask) + Opcodes.push_back(RISCV::VCPOP_M); + if (Opcode == Instruction::Store) + Opcodes.append({RISCV::VCOMPRESS_VM}); + else + Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV}); + return MemOpCost + + LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); +} + InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { diff --git 
a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 042530b..5389e9b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -174,6 +174,12 @@ public: TTI::TargetCostKind CostKind, const Instruction *I); + InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, + bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 78db841..c202f7f 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); - int Offset = 0; + int64_t Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. @@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, if (Offset) { // Check for possible merge with preceding ADD instruction. - Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); + Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true); X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true); } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a15db03..50c56c9 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { return false; } +constexpr int64_t MaxSPChunk = (1LL << 31) - 1; + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, @@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, return; } - uint64_t Chunk = (1LL << 31) - 1; + uint64_t Chunk = MaxSPChunk; MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); @@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const { +template <typename FoundT, typename CalcT> +int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, + CalcT CalcNewOffset, + bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) - return 0; + return CalcNewOffset(0); MachineBasicBlock::iterator PI = doMergeWithPrevious ? 
std::prev(MBBI) : MBBI; @@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction()) PI = std::prev(PI); - unsigned Opc = PI->getOpcode(); - int Offset = 0; - - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = PI->getOperand(2).getImm(); - } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr && - PI->getOperand(1).getReg() == StackPtr && - PI->getOperand(2).getImm() == 1 && - PI->getOperand(3).getReg() == X86::NoRegister && - PI->getOperand(5).getReg() == X86::NoRegister) { - // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. - Offset = PI->getOperand(4).getImm(); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = -PI->getOperand(2).getImm(); - } else - return 0; + int64_t Offset = 0; + for (;;) { + unsigned Opc = PI->getOpcode(); + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = PI->getOperand(2).getImm(); + } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr && + PI->getOperand(1).getReg() == StackPtr && + PI->getOperand(2).getImm() == 1 && + PI->getOperand(3).getReg() == X86::NoRegister && + PI->getOperand(5).getReg() == X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. + Offset = PI->getOperand(4).getImm(); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = -PI->getOperand(2).getImm(); + } else + return CalcNewOffset(0); + + FoundStackAdjust(PI, Offset); + if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk) + break; + + if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end())) + return CalcNewOffset(0); + + PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI); + } PI = MBB.erase(PI); if (PI != MBB.end() && PI->isCFIInstruction()) { @@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (!doMergeWithPrevious) MBBI = skipDebugInstructionsForward(PI, MBB.end()); - return Offset; + return CalcNewOffset(Offset); +} + +int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, + bool doMergeWithPrevious) const { + return mergeSPUpdates( + MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; }, + doMergeWithPrevious); } void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, @@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and - // the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, true); + // the callee has more arguments than the caller. + NumBytes = mergeSPUpdates( + MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; }, + true); // Adjust stack pointer: ESP -= numbytes. @@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (HasFP) { if (X86FI->hasSwiftAsyncContext()) { // Discard the context. 
- int Offset = 16 + mergeSPUpdates(MBB, MBBI, true); + int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true); emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true); } // Pop EBP. @@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI.hasVarSizedObjects()) - NumBytes += mergeSPUpdates(MBB, MBBI, true); + NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was @@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. - int Offset = -1 * X86FI->getTCReturnAddrDelta(); + int64_t Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, Terminator, true); + Offset = mergeSPAdd(MBB, Terminator, Offset, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } @@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // Add Amount to SP to destroy a frame, or subtract to setup. int64_t StackAdjustment = isDestroy ? Amount : -Amount; + int64_t CfaAdjustment = StackAdjustment; if (StackAdjustment) { // Merge with any previous or following adjustment instruction. Note: the // instructions merged with here do not have CFI, so their stack - // adjustments do not feed into CfaAdjustment. - StackAdjustment += mergeSPUpdates(MBB, InsertPos, true); - StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); + // adjustments do not feed into CfaAdjustment + + auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI, + int64_t Offset) { + CfaAdjustment += Offset; + }; + auto CalcNewOffset = [&StackAdjustment](int64_t Offset) { + return StackAdjustment + Offset; + }; + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true); + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false); if (StackAdjustment) { if (!(F.hasMinSize() && @@ -3830,7 +3868,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( } } - if (DwarfCFI && !hasFP(MF)) { + if (DwarfCFI && !hasFP(MF) && CfaAdjustment) { // If we don't have FP, but need to generate unwind information, // we need to set the correct CFA offset after the stack adjustment. // How much we adjust the CFA offset depends on whether we're emitting @@ -3838,14 +3876,11 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // offset to be correct at each call site, while for debugging we want // it to be more precise. - int64_t CfaAdjustment = -StackAdjustment; // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. 
- if (CfaAdjustment) { - BuildCFI( - MBB, InsertPos, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment)); - } + BuildCFI( + MBB, InsertPos, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment)); } return I; diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 02fe8ee..ef41b46 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -134,12 +134,50 @@ public: processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, RegScavenger *RS) const override; - /// Check the instruction before/after the passed instruction. If - /// it is an ADD/SUB/LEA instruction it is deleted argument and the - /// stack adjustment is returned as a positive value for ADD/LEA and - /// a negative for SUB. - int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const; +private: + /// Basic Pseudocode: + /// if (instruction before/after the passed instruction is ADD/SUB/LEA) + /// Offset = instruction stack adjustment + /// ... positive value for ADD/LEA and negative for SUB + /// FoundStackAdjust(instruction, Offset) + /// erase(instruction) + /// return CalcNewOffset(Offset) + /// else + /// return CalcNewOffset(0) + /// + /// It's possible that the selected instruction is not immediately + /// before/after MBBI for large adjustments that have been split into multiple + /// instructions. + /// + /// FoundStackAdjust should have the signature: + /// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset) + /// CalcNewOffset should have the signature: + /// int64_t CalcNewOffset(int64_t Offset) + template <typename FoundT, typename CalcT> + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, CalcT CalcNewOffset, + bool doMergeWithPrevious) const; + + template <typename CalcT> + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset, + bool doMergeWithPrevious) const { + auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI, + int64_t Offset) {}; + return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset, + doMergeWithPrevious); + } + +public: + /// Equivalent to: + /// mergeSPUpdates(MBB, MBBI, + /// [AddOffset](int64_t Offset) { + /// return AddOffset + Offset; + /// }, + /// doMergeWithPrevious); + int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, bool doMergeWithPrevious) const; /// Emit a series of instructions to increment / decrement the stack /// pointer by a constant value. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8f90420..6cf6061 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2944,7 +2944,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM, } /// Return true if the condition is an signed comparison operation. -static bool isX86CCSigned(unsigned X86CC) { +static bool isX86CCSigned(X86::CondCode X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); @@ -22975,7 +22975,7 @@ static bool isProfitableToUseFlagOp(SDValue Op) { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. 
-static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, +static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. @@ -23085,7 +23085,7 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, +static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (isNullConstant(Op1)) @@ -23157,10 +23157,17 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return Add.getValue(1); } - // Use SUB instead of CMP to enable CSE between SUB and CMP. + // If we already have an XOR of the ops, use that to check for equality. + // Else use SUB instead of CMP to enable CSE between SUB and CMP. + unsigned X86Opc = X86ISD::SUB; + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) || + DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0}))) + X86Opc = X86ISD::XOR; + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return Sub.getValue(1); + SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1); + return CmpOp.getValue(1); } bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, |
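The EmitCmp change above prefers an already-existing XOR of the operands over emitting a SUB when only equality is being tested. A minimal sketch of that decision, assuming IsEqualityCC stands in for the X86::COND_E/COND_NE check and XorOfOpsExists for the DAG.doesNodeExist(ISD::XOR, ...) queries; CmpOp and chooseCmpLowering are illustrative names, not SelectionDAG API:

// Sketch of the compare-lowering choice made in EmitCmp above (illustrative
// helper, not the real lowering code).
#include <cassert>

enum class CmpOp { Sub, Xor };

static CmpOp chooseCmpLowering(bool IsEqualityCC, bool XorOfOpsExists) {
  // SUB and XOR both set ZF exactly when the operands are equal, so an
  // existing XOR can stand in for the compare -- but only for COND_E/COND_NE,
  // since the ordered condition codes also need CF/OF/SF from SUB.
  return (IsEqualityCC && XorOfOpsExists) ? CmpOp::Xor : CmpOp::Sub;
}

int main() {
  assert(chooseCmpLowering(true, true) == CmpOp::Xor);   // reuse the existing XOR
  assert(chooseCmpLowering(true, false) == CmpOp::Sub);  // default SUB-based compare
  assert(chooseCmpLowering(false, true) == CmpOp::Sub);  // ordered conditions need SUB flags
}

The reuse is limited to equality because an XOR only reproduces ZF; the ordered condition codes also consume CF, OF and SF, which only the SUB-based compare provides.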