author     mingmingl <mingmingl@google.com>  2025-02-04 11:11:14 -0800
committer  mingmingl <mingmingl@google.com>  2025-02-04 11:11:14 -0800
commit     e91747a92d27ecf799427bf563f9f64f7c4d2447 (patch)
tree       7aa5a8a9170deec293e152bdf2be804399dcd612 /llvm/lib/Target/AMDGPU
parent     3a8d9337d816aef41c3ca1484be8b933a71a3c46 (diff)
parent     53d6e59b594639417cdbfcfa2d18cea64acb4009 (diff)
Merge branch 'main' into users/mingmingl-llvm/spr/sdpglobalvariable
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp          | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp   | 59
-rw-r--r--  llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp |  4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td               | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h          |  3
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp     | 54
6 files changed, 90 insertions, 82 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa7..792e17e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
     if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
-      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
-        SDValue BV = stripBitcast(Src.getOperand(0));
-        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
-            BV.getValueType().getVectorNumElements() == 2) {
-          SDValue SrcElt = BV.getOperand(1);
-          EVT SrcEltVT = SrcElt.getValueType();
-          if (SrcEltVT.isFloatingPoint()) {
-            SrcElt = DAG.getNode(ISD::BITCAST, SL,
-                                 SrcEltVT.changeTypeToInteger(), SrcElt);
+      SDValue BV = stripBitcast(Src.getOperand(0));
+      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcEltVT = BV.getOperand(0).getValueType();
+        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+        unsigned BitIndex = K->getZExtValue();
+        unsigned PartIndex = BitIndex / SrcEltSize;
+
+        if (PartIndex * SrcEltSize == BitIndex &&
+            PartIndex < BV.getNumOperands()) {
+          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
+            SDValue SrcElt =
+                DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
+                            BV.getOperand(PartIndex));
+            return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
           }
-
-          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
         }
       }
     }
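The rewritten combine generalizes the old special case, which only handled a two-element build_vector shifted by exactly half its width. The shift amount is now treated as a bit index: the fold fires whenever that index is element-aligned, names an operand that actually exists, and the element width matches the truncated result width. A minimal standalone sketch of that index arithmetic, in plain C++ with integer widths standing in for the EVT queries (foldableBuildVectorElement is an illustrative name, not an LLVM API):

#include <cstdio>

// Mirrors the combine's test: trunc (srl (build_vector ...), ShiftAmt) can
// read operand ShiftAmt / EltBits directly when the shift is element-aligned
// and the element width equals the truncated result width.
// Returns the operand index, or -1 when the fold does not apply.
int foldableBuildVectorElement(unsigned ShiftAmt, unsigned EltBits,
                               unsigned NumElts, unsigned DstBits) {
  unsigned PartIndex = ShiftAmt / EltBits;
  if (PartIndex * EltBits != ShiftAmt) // shift must land on an element boundary
    return -1;
  if (PartIndex >= NumElts)            // index must name a real operand
    return -1;
  if (EltBits != DstBits)              // element must match the result width
    return -1;
  return static_cast<int>(PartIndex);
}

int main() {
  printf("%d\n", foldableBuildVectorElement(16, 16, 2, 16)); // 1: the old pattern
  printf("%d\n", foldableBuildVectorElement(32, 16, 4, 16)); // 2: newly handled
  printf("%d\n", foldableBuildVectorElement(8, 16, 2, 16));  // -1: misaligned
  return 0;
}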
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd891..09f7877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
-  // hardware into byte accesses. If you assume all alignments are equally
-  // probable, it's more efficient on average to use short accesses for this
-  // case.
-  if (MinAlign == Align(2))
-    return Type::getInt16Ty(Context);
-
-  // Not all subtargets have 128-bit DS instructions, and we currently don't
-  // form them by default.
-  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
-  }
-
-  // Global memory works best with 16-byte accesses.
+  // 16-byte accesses achieve the highest copy throughput.
   // If the operation has a fixed known length that is large enough, it is
   // worthwhile to return an even wider type and let legalization lower it into
-  // multiple accesses, effectively unrolling the memcpy loop. Private memory
-  // also hits this, although accesses may be decomposed.
+  // multiple accesses, effectively unrolling the memcpy loop.
+  // We also rely on legalization to decompose into smaller accesses for
+  // subtargets and address spaces where it is necessary.
   //
   // Don't unroll if Length is not a constant, since unrolling leads to worse
   // performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
       OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
       DestAlign, AtomicCpySize);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  if (MinAlign != Align(2)) {
-    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
-    while (RemainingBytes >= 16) {
-      OpsOut.push_back(I32x4Ty);
-      RemainingBytes -= 16;
-    }
+  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  while (RemainingBytes >= 16) {
+    OpsOut.push_back(I32x4Ty);
+    RemainingBytes -= 16;
+  }
 
-    Type *I64Ty = Type::getInt64Ty(Context);
-    while (RemainingBytes >= 8) {
-      OpsOut.push_back(I64Ty);
-      RemainingBytes -= 8;
-    }
+  Type *I64Ty = Type::getInt64Ty(Context);
+  while (RemainingBytes >= 8) {
+    OpsOut.push_back(I64Ty);
+    RemainingBytes -= 8;
+  }
 
-    Type *I32Ty = Type::getInt32Ty(Context);
-    while (RemainingBytes >= 4) {
-      OpsOut.push_back(I32Ty);
-      RemainingBytes -= 4;
-    }
+  Type *I32Ty = Type::getInt32Ty(Context);
+  while (RemainingBytes >= 4) {
+    OpsOut.push_back(I32Ty);
+    RemainingBytes -= 4;
   }
 
   Type *I16Ty = Type::getInt16Ty(Context);
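With the Align(2) special case gone, the residual lowering always carves the remaining bytes greedily from the widest access downward, and legalization is left to split whatever a given subtarget or address space cannot issue as one access. A standalone sketch of the greedy decomposition, in plain C++ (the chunk sizes mirror the i32x4/i64/i32 cascade above; the 2- and 1-byte steps are assumed from the unchanged i16 tail that follows in the file):

#include <cstdio>
#include <initializer_list>
#include <vector>

// Cover RemainingBytes with the widest chunks first, as
// getMemcpyLoopResidualLoweringType does after this change.
std::vector<unsigned> residualChunks(unsigned RemainingBytes) {
  std::vector<unsigned> Chunks;
  for (unsigned Size : {16u, 8u, 4u, 2u, 1u})
    while (RemainingBytes >= Size) {
      Chunks.push_back(Size);
      RemainingBytes -= Size;
    }
  return Chunks;
}

int main() {
  // A 27-byte tail decomposes as 16 + 8 + 2 + 1.
  for (unsigned C : residualChunks(27))
    printf("%u ", C);
  printf("\n");
  return 0;
}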
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index a20319e..ac11526 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
     RegSeqInfo &CompatibleRSI,
     std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
   unsigned NeededUndefs = 4 - RSI.UndefReg.size();
-  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
-    return false;
   std::vector<MachineInstr *> &MIs =
       PreviousRegSeqByUndefCount[NeededUndefs];
+  if (MIs.empty())
+    return false;
   CompatibleRSI = PreviousRegSeq[MIs.back()];
   tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
   return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bee4c47..6e08aff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
   (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
 >;
 
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = NotHasTrue16BitInsts in {
   def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
   def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [NotHasTrue16BitInsts]
+} // end True16Predicate = NotHasTrue16BitInsts
+
+let True16Predicate = UseRealTrue16Insts in {
+  def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+  def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end True16Predicate = UseRealTrue16BitInsts
 
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
   def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
   def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [HasTrue16BitInsts]
+} // end True16Predicate = UseFakeTrue16BitInsts
 
 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
@@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>
 def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
 }
 
+let True16Predicate = UseRealTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
 let True16Predicate = UseFakeTrue16Insts in {
 def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
@@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>
 def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
 }
 
+let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
+
 let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
 def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
 def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fad7e67..67bebfb3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -976,8 +976,7 @@ struct Waitcnt {
   Waitcnt() = default;
   // Pre-gfx12 constructor.
   Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
-      : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
-        SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+      : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
 
   // gfx12+ constructor.
   Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
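The shortened pre-gfx12 constructor is behavior-preserving only because the gfx12-only counters it no longer lists (SampleCnt, BvhCnt, KmCnt) fall back to in-class default member initializers of ~0u, meaning no wait required. A reduced stand-in showing the pattern, assuming those defaults as the unchanged parts of the header suggest (WaitcntSketch is illustrative, not the real struct):

#include <cassert>

// Members carry in-class defaults, so a constructor only needs to list the
// counters it actually sets; everything else stays at ~0u ("no wait").
struct WaitcntSketch {
  unsigned LoadCnt = ~0u;
  unsigned ExpCnt = ~0u;
  unsigned DsCnt = ~0u;
  unsigned StoreCnt = ~0u;
  unsigned SampleCnt = ~0u; // gfx12+ only
  unsigned BvhCnt = ~0u;    // gfx12+ only
  unsigned KmCnt = ~0u;     // gfx12+ only

  WaitcntSketch() = default;
  // Pre-gfx12 constructor: the gfx12-only members pick up their defaults.
  WaitcntSketch(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt,
                unsigned VsCnt)
      : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
};

int main() {
  WaitcntSketch W(0, 1, 2, 3);
  assert(W.SampleCnt == ~0u && W.BvhCnt == ~0u && W.KmCnt == ~0u);
  return 0;
}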
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 1e76bf7..296031e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -27,6 +27,28 @@
 using namespace llvm;
 using namespace llvm::AMDGPU;
 
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::AMDGPU_PS:
+    return ".ps";
+  case CallingConv::AMDGPU_VS:
+    return ".vs";
+  case CallingConv::AMDGPU_GS:
+    return ".gs";
+  case CallingConv::AMDGPU_ES:
+    return ".es";
+  case CallingConv::AMDGPU_HS:
+    return ".hs";
+  case CallingConv::AMDGPU_LS:
+    return ".ls";
+  case CallingConv::AMDGPU_Gfx:
+    llvm_unreachable("Callable shader has no hardware stage");
+  default:
+    return ".cs";
+  }
+}
+
 // Read the PAL metadata from IR metadata, where it was put by the frontend.
 void AMDGPUPALMetadata::readFromIR(Module &M) {
   auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
@@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
   if (isLegacy())
     return;
   // Msgpack format.
+  // Entry point is updated to .entry_point_symbol and is set to the function
+  // name
   getHwStage(CC)[".entry_point_symbol"] =
       MsgPackDoc.getNode(Name, /*Copy=*/true);
+
+  // Set .entry_point which is defined
+  // to be _amdgpu_<stage> and _amdgpu_cs for non-shader functions
+  SmallString<16> EPName("_amdgpu_");
+  raw_svector_ostream EPNameOS(EPName);
+  EPNameOS << getStageName(CC) + 1;
+  getHwStage(CC)[".entry_point"] =
+      MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true);
 }
 
 // Set the number of used vgprs in the metadata. This is an optional
@@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
   return GraphicsRegisters.getMap();
 }
 
-// Return the PAL metadata hardware shader stage name.
-static const char *getStageName(CallingConv::ID CC) {
-  switch (CC) {
-  case CallingConv::AMDGPU_PS:
-    return ".ps";
-  case CallingConv::AMDGPU_VS:
-    return ".vs";
-  case CallingConv::AMDGPU_GS:
-    return ".gs";
-  case CallingConv::AMDGPU_ES:
-    return ".es";
-  case CallingConv::AMDGPU_HS:
-    return ".hs";
-  case CallingConv::AMDGPU_LS:
-    return ".ls";
-  case CallingConv::AMDGPU_Gfx:
-    llvm_unreachable("Callable shader has no hardware stage");
-  default:
-    return ".cs";
-  }
-}
-
 msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
   auto &N = MsgPackDoc.getRoot()
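setEntryPoint now records a fixed .entry_point value alongside the real symbol name: getStageName(CC) + 1 skips the leading dot of the stage name, producing _amdgpu_<stage>, with _amdgpu_cs as the default for non-shader functions. A standalone sketch of the same string construction, in plain C++ with std::string in place of SmallString and raw_svector_ostream (stageName and entryPointName are illustrative names):

#include <cstdio>
#include <string>

// Stand-in for getStageName: PAL stage names carry a leading '.', and compute
// is the default stage.
static const char *stageName(bool IsPixelShader) {
  return IsPixelShader ? ".ps" : ".cs";
}

// stageName(...) + 1 skips the dot, mirroring the
// EPNameOS << getStageName(CC) + 1 construction in the diff above.
static std::string entryPointName(bool IsPixelShader) {
  return std::string("_amdgpu_") + (stageName(IsPixelShader) + 1);
}

int main() {
  printf("%s\n", entryPointName(true).c_str());  // _amdgpu_ps
  printf("%s\n", entryPointName(false).c_str()); // _amdgpu_cs
  return 0;
}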