author     mingmingl <mingmingl@google.com>  2025-02-04 11:11:14 -0800
committer  mingmingl <mingmingl@google.com>  2025-02-04 11:11:14 -0800
commit  e91747a92d27ecf799427bf563f9f64f7c4d2447 (patch)
tree    7aa5a8a9170deec293e152bdf2be804399dcd612 /llvm/lib/Target/AMDGPU
parent  3a8d9337d816aef41c3ca1484be8b933a71a3c46 (diff)
parent  53d6e59b594639417cdbfcfa2d18cea64acb4009 (diff)
Merge branch 'main' into users/mingmingl-llvm/spr/sdpglobalvariable
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp          | 25
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp   | 59
 llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp |  4
 llvm/lib/Target/AMDGPU/SIInstructions.td               | 27
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h          |  3
 llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp     | 54
 6 files changed, 90 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cca9fa7..792e17e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4217,18 +4217,21 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
// trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
- if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
- SDValue BV = stripBitcast(Src.getOperand(0));
- if (BV.getOpcode() == ISD::BUILD_VECTOR &&
- BV.getValueType().getVectorNumElements() == 2) {
- SDValue SrcElt = BV.getOperand(1);
- EVT SrcEltVT = SrcElt.getValueType();
- if (SrcEltVT.isFloatingPoint()) {
- SrcElt = DAG.getNode(ISD::BITCAST, SL,
- SrcEltVT.changeTypeToInteger(), SrcElt);
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT SrcEltVT = BV.getOperand(0).getValueType();
+ unsigned SrcEltSize = SrcEltVT.getSizeInBits();
+ unsigned BitIndex = K->getZExtValue();
+ unsigned PartIndex = BitIndex / SrcEltSize;
+
+ if (PartIndex * SrcEltSize == BitIndex &&
+ PartIndex < BV.getNumOperands()) {
+ if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
+ SDValue SrcElt =
+ DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
+ BV.getOperand(PartIndex));
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
-
- return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
}
}
}
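
The new combine generalizes the old two-element special case: instead of requiring the shift amount to be exactly half the source width, it accepts any shift that lands on an element boundary of the build_vector. A minimal standalone C++ sketch of that index arithmetic (illustrative names, not LLVM APIs):

#include <cstdint>
#include <optional>

// Illustrative sketch, not an LLVM API: given a right-shift amount in bits
// and the element width of a build_vector, return the operand index that the
// shift selects, or std::nullopt when the shift does not land exactly on an
// element boundary. Mirrors the PartIndex arithmetic in the combine above.
static std::optional<unsigned> shiftedElementIndex(uint64_t BitIndex,
                                                   unsigned SrcEltSize,
                                                   unsigned NumElts) {
  uint64_t PartIndex = BitIndex / SrcEltSize;
  if (PartIndex * SrcEltSize != BitIndex) // shift not element-aligned
    return std::nullopt;
  if (PartIndex >= NumElts)               // shift past the last element
    return std::nullopt;
  return static_cast<unsigned>(PartIndex);
}
// e.g. shiftedElementIndex(32, 32, 4) == 1: an srl by 32 over a vector of
// 32-bit elements selects build_vector operand 1, so the truncate can be
// rewritten as trunc(bitcast(element 1)).
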
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd891..09f7877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
return 1024;
}
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
- // hardware into byte accesses. If you assume all alignments are equally
- // probable, it's more efficient on average to use short accesses for this
- // case.
- if (MinAlign == Align(2))
- return Type::getInt16Ty(Context);
-
- // Not all subtargets have 128-bit DS instructions, and we currently don't
- // form them by default.
- if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
- DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
- return FixedVectorType::get(Type::getInt32Ty(Context), 2);
- }
-
- // Global memory works best with 16-byte accesses.
+ // 16-byte accesses achieve the highest copy throughput.
// If the operation has a fixed known length that is large enough, it is
// worthwhile to return an even wider type and let legalization lower it into
- // multiple accesses, effectively unrolling the memcpy loop. Private memory
- // also hits this, although accesses may be decomposed.
+ // multiple accesses, effectively unrolling the memcpy loop.
+ // We also rely on legalization to decompose into smaller accesses for
+ // subtargets and address spaces where it is necessary.
//
// Don't unroll if Length is not a constant, since unrolling leads to worse
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- if (MinAlign != Align(2)) {
- Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
- while (RemainingBytes >= 16) {
- OpsOut.push_back(I32x4Ty);
- RemainingBytes -= 16;
- }
+ Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ while (RemainingBytes >= 16) {
+ OpsOut.push_back(I32x4Ty);
+ RemainingBytes -= 16;
+ }
- Type *I64Ty = Type::getInt64Ty(Context);
- while (RemainingBytes >= 8) {
- OpsOut.push_back(I64Ty);
- RemainingBytes -= 8;
- }
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
- Type *I32Ty = Type::getInt32Ty(Context);
- while (RemainingBytes >= 4) {
- OpsOut.push_back(I32Ty);
- RemainingBytes -= 4;
- }
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
}
Type *I16Ty = Type::getInt16Ty(Context);
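
After the change, the residual lowering no longer special-cases alignment 2: it always covers the remaining bytes greedily from the widest type down, relying on legalization to split accesses where required. A minimal sketch of that greedy decomposition (hypothetical function name, not the LLVM implementation):

#include <cstdio>

// Illustrative sketch, not the LLVM implementation: cover a residual byte
// count greedily with 16-, 8-, 4-, 2- and 1-byte accesses, matching the
// widest-first order in which getMemcpyLoopResidualLoweringType emits types.
static void decomposeResidual(unsigned RemainingBytes) {
  static const unsigned Sizes[] = {16, 8, 4, 2, 1};
  for (unsigned Size : Sizes) {
    while (RemainingBytes >= Size) {
      std::printf("%u-byte access\n", Size);
      RemainingBytes -= Size;
    }
  }
}

int main() {
  decomposeResidual(27); // prints 16, 8, 2, 1 (27 = 16 + 8 + 2 + 1)
}
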
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index a20319e..ac11526 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
RegSeqInfo &CompatibleRSI,
std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
- if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
- return false;
std::vector<MachineInstr *> &MIs =
PreviousRegSeqByUndefCount[NeededUndefs];
+ if (MIs.empty())
+ return false;
CompatibleRSI = PreviousRegSeq[MIs.back()];
tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
return true;
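
The reordering binds the map entry once and reuses the reference instead of performing a second lookup after the emptiness check; the original operator[] call already default-constructed a missing entry, so the behavior is unchanged. A small sketch of the pattern, with hypothetical names and std::map standing in for the actual container:

#include <map>
#include <vector>

// Sketch of the pattern fixed above (hypothetical names): operator[]
// default-constructs a missing entry and returns a reference; binding that
// reference once avoids a second lookup when the entry is used after the
// emptiness check.
static bool lookupOnce(std::map<unsigned, std::vector<int>> &Table,
                       unsigned Key) {
  std::vector<int> &Entries = Table[Key]; // single lookup, reused below
  if (Entries.empty())
    return false;
  return Entries.back() > 0; // safe: Entries is known non-empty here
}
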
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bee4c47..6e08aff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2703,15 +2703,20 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = NotHasTrue16BitInsts in {
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [NotHasTrue16BitInsts]
+} // end True16Predicate = NotHasTrue16BitInsts
+
+let True16Predicate = UseRealTrue16Insts in {
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end True16Predicate = UseRealTrue16Insts
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
-} // end OtherPredicates = [HasTrue16BitInsts]
+} // end True16Predicate = UseFakeTrue16Insts
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
@@ -3790,6 +3795,13 @@ def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_t16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_t16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts in {
def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
@@ -3819,6 +3831,13 @@ def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}
+let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = isGFX12Plus in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_t16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_t16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
+
let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fad7e67..67bebfb3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -976,8 +976,7 @@ struct Waitcnt {
Waitcnt() = default;
// Pre-gfx12 constructor.
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
- : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
- SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+ : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
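
Dropping the explicit ~0u initializers is safe on the assumption that the remaining counters carry in-class default member initializers of ~0u; a minimal sketch of the idiom, with a hypothetical struct mirroring the field names in the diff:

// Minimal sketch, assuming the real struct declares in-class defaults of
// ~0u for every counter (hypothetical mirror of the diff):
struct WaitcntSketch {
  unsigned LoadCnt = ~0u;
  unsigned ExpCnt = ~0u;
  unsigned DsCnt = ~0u;
  unsigned StoreCnt = ~0u;
  unsigned SampleCnt = ~0u; // keeps its default after the ctor below runs
  unsigned BvhCnt = ~0u;
  unsigned KmCnt = ~0u;

  // Pre-gfx12 constructor: only the four legacy counters are overridden;
  // the rest no longer need to be repeated in the init list.
  WaitcntSketch(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt,
                unsigned VsCnt)
      : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt) {}
};
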
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 1e76bf7..296031e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -27,6 +27,28 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return ".ps";
+ case CallingConv::AMDGPU_VS:
+ return ".vs";
+ case CallingConv::AMDGPU_GS:
+ return ".gs";
+ case CallingConv::AMDGPU_ES:
+ return ".es";
+ case CallingConv::AMDGPU_HS:
+ return ".hs";
+ case CallingConv::AMDGPU_LS:
+ return ".ls";
+ case CallingConv::AMDGPU_Gfx:
+ llvm_unreachable("Callable shader has no hardware stage");
+ default:
+ return ".cs";
+ }
+}
+
// Read the PAL metadata from IR metadata, where it was put by the frontend.
void AMDGPUPALMetadata::readFromIR(Module &M) {
auto *NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
@@ -232,8 +254,18 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
if (isLegacy())
return;
// Msgpack format.
+ // Record the function name under the .entry_point_symbol key.
getHwStage(CC)[".entry_point_symbol"] =
MsgPackDoc.getNode(Name, /*Copy=*/true);
+
+ // Also set .entry_point, which is defined to be _amdgpu_<stage>
+ // (_amdgpu_cs for non-shader functions).
+ SmallString<16> EPName("_amdgpu_");
+ raw_svector_ostream EPNameOS(EPName);
+ EPNameOS << getStageName(CC) + 1;
+ getHwStage(CC)[".entry_point"] =
+ MsgPackDoc.getNode(EPNameOS.str(), /*Copy=*/true);
}
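
Since getStageName returns names of the form ".ps", the +1 advances past the leading dot, so the composed value is _amdgpu_<stage> (for example _amdgpu_ps, or _amdgpu_cs for the default case). A minimal sketch of the composition, with hypothetical names outside of the stage strings themselves:

#include <cstdio>
#include <string>

// Sketch of the name composition (helper name is hypothetical): stage names
// come back as ".ps", ".cs", etc., so advancing the pointer by one drops the
// leading dot before appending to the "_amdgpu_" prefix.
static std::string entryPointName(const char *StageName /* e.g. ".ps" */) {
  return std::string("_amdgpu_") + (StageName + 1); // skip the '.'
}

int main() {
  std::printf("%s\n", entryPointName(".ps").c_str()); // prints _amdgpu_ps
}
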
// Set the number of used vgprs in the metadata. This is an optional
@@ -943,28 +975,6 @@ msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
return GraphicsRegisters.getMap();
}
-// Return the PAL metadata hardware shader stage name.
-static const char *getStageName(CallingConv::ID CC) {
- switch (CC) {
- case CallingConv::AMDGPU_PS:
- return ".ps";
- case CallingConv::AMDGPU_VS:
- return ".vs";
- case CallingConv::AMDGPU_GS:
- return ".gs";
- case CallingConv::AMDGPU_ES:
- return ".es";
- case CallingConv::AMDGPU_HS:
- return ".hs";
- case CallingConv::AMDGPU_LS:
- return ".ls";
- case CallingConv::AMDGPU_Gfx:
- llvm_unreachable("Callable shader has no hardware stage");
- default:
- return ".cs";
- }
-}
-
msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
auto &N =
MsgPackDoc.getRoot()