aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZeng Wu <zengwu13@amd.com>2026-04-28 21:25:29 -0700
committerGitHub <noreply@github.com>2026-04-28 21:25:29 -0700
commit19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11 (patch)
treec71dc2c1a8c95b767e6bef0c044a6fd1205b3c27
parente42c2fef2c2add3bd661149c89c9ba4b581bf76a (diff)
downloadllvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.tar.gz
llvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.tar.bz2
llvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.zip
[AMDGPU][MC] update USER_SGPR_COUNT bits for GFX1250 (#192579)
When working on a Triton kernel whose tensor descriptor is created on the host side, we hit the error message `amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs`. After some debugging, we found that `USER_SGPR_COUNT` had not been updated for GFX125; this patch updates it to match the USER_SGPR_COUNT definition in https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx12-table. On GFX125, COMPUTE_PGM_RSRC2::USER_SGPR_COUNT is 6 bits wide. The MC helper S_00B84C_USER_SGPR only masks to 5 bits; when the true user SGPR count is 32 or more, the masked value wraps (e.g. 32 -> 0). `AMDGPUAsmPrinter` then emits .amdhsa_user_sgpr_count with a value of 0, which disagrees with the count implied by the enabled user SGPRs (including kernarg preload), and finally assembling the llc output with `llvm-mc` fails in `AMDGPUAsmParser`. --------- Co-authored-by: Shilei Tian <i@tianshilei.me>
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h6
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll21
-rw-r--r--llvm/test/MC/AMDGPU/user-sgpr-count-diag.s6
-rw-r--r--llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s27
8 files changed, 92 insertions, 25 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 94ff6c2daf33..a788c1384821 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -678,7 +678,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
- KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
+ KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(STM, Ctx);
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
int64_t PGM_Rsrc3 = 1;
@@ -1392,7 +1392,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(
/*Size=*/4);
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
- EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
+ EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx),
+ /*Size=*/4);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
@@ -1518,7 +1519,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
if (AMDGPU::isCompute(CC)) {
- MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
+ MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), Ctx);
} else {
const MCExpr *HasScratchBlocks =
MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
@@ -1600,7 +1601,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
MD->setRsrc2(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
+ CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx);
} else {
EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
@@ -1643,7 +1644,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
Out.compute_pgm_resource1_registers =
CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
Out.compute_pgm_resource2_registers =
- CurrentProgramInfo.getComputePGMRSrc2(Ctx);
+ CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f19ac78abbd7..7b5738845ef2 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6404,7 +6404,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext());
if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount)
- return TokError("amdgpu_user_sgpr_count smaller than than implied by "
+ return TokError("amdgpu_user_sgpr_count smaller than implied by "
"enabled user SGPRs");
if (isGFX1250Plus()) {
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 5612b0df7aa9..9867f5b02d7a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1211,6 +1211,10 @@ enum {
#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
#define C_00B84C_EXCP_EN 0x80FFFFFF
+#define S_00B84C_USER_SGPR_GFX1250(x) (((x) & 0x3F) << 1)
+#define G_00B84C_USER_SGPR_GFX1250(x) (((x) >> 1) & 0x3F)
+#define C_00B84C_USER_SGPR_GFX1250 0xFFFFFF81
+
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index a3f261b87e80..471e2f7a3857 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -136,17 +136,27 @@ static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
return Reg;
}
-static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
- uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
- S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
- S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
- S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
- S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
- S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
- S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
- S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
- S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
- S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
+static uint64_t getComputePGMRSrc2Reg(const GCNSubtarget &ST,
+ const SIProgramInfo &ProgInfo) {
+ uint64_t MaxNumerUserSGRPs = AMDGPU::getMaxNumUserSGPRs(ST);
+ uint64_t Reg = 0;
+ if (MaxNumerUserSGRPs == 32) {
+ Reg = S_00B84C_USER_SGPR_GFX1250(ProgInfo.UserSGPR);
+ } else if (MaxNumerUserSGRPs == 16) {
+ Reg = (S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
+ S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable));
+ } else {
+ llvm_unreachable("max Number of User SGPRs are either 32 or 16");
+ }
+
+ Reg |= S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
+ S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
+ S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
+ S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
+ S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
+ S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
+ S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
+ S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
return Reg;
}
@@ -189,16 +199,18 @@ const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
}
-const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
- uint64_t Reg = getComputePGMRSrc2Reg(*this);
+const MCExpr *SIProgramInfo::getComputePGMRSrc2(const GCNSubtarget &ST,
+ MCContext &Ctx) const {
+ uint64_t Reg = getComputePGMRSrc2Reg(ST, *this);
const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
}
const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
+ const GCNSubtarget &ST,
MCContext &Ctx) const {
if (AMDGPU::isCompute(CC))
- return getComputePGMRSrc2(Ctx);
+ return getComputePGMRSrc2(ST, Ctx);
return MCConstantExpr::create(0, Ctx);
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 171c4a313a53..947b473142a1 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -117,8 +117,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
MCContext &Ctx) const;
/// Compute the value of the ComputePGMRsrc2 register.
- const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
- const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
+ const MCExpr *getComputePGMRSrc2(const GCNSubtarget &ST,
+ MCContext &Ctx) const;
+ const MCExpr *getPGMRSrc2(CallingConv::ID CC, const GCNSubtarget &ST,
+ MCContext &Ctx) const;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index 4054d4c99c2b..933f81c27934 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,5 +1,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=GFX1250-OBJDUMP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250-ASM %s
; OBJDUMP: Contents of section .rodata:
; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
@@ -70,4 +72,23 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void }
+; GFX1250-OBJDUMP: 0100 00000000 00000000 90010000 00000000 ................
+; GFX1250-OBJDUMP: 0110 00000000 00000000 00000000 00000000 ................
+; GFX1250-OBJDUMP: 0120 00000000 00000000 00000000 10000000 ................
+; GFX1250-OBJDUMP: 0130 00000fc0 c0130000 1e041800 00000000 ................
+; GFX1250-ASM: .sgpr_count: 32
+define amdgpu_kernel void @many__i32(
+ i32 inreg %a0, i32 inreg %a1, i32 inreg %a2, i32 inreg %a3,
+ i32 inreg %a4, i32 inreg %a5, i32 inreg %a6, i32 inreg %a7,
+ i32 inreg %a8, i32 inreg %a9, i32 inreg %a10, i32 inreg %a11,
+ i32 inreg %a12, i32 inreg %a13, i32 inreg %a14, i32 inreg %a15,
+ i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19,
+ i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23,
+ i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27,
+ i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31,
+ i32 inreg %a32, i32 inreg %a33, i32 inreg %a34, i32 inreg %a35) {
+ ret void
+}
+
+
attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
index aad219bc77d4..59e0cb93e289 100644
--- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
@@ -6,7 +6,7 @@
.amdhsa_accum_offset 4
.amdhsa_next_free_vgpr 32
.amdhsa_next_free_sgpr 32
-// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs
+// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs
.end_amdhsa_kernel
.amdhsa_kernel implied_count_too_low_1
@@ -15,7 +15,7 @@
.amdhsa_accum_offset 4
.amdhsa_next_free_vgpr 32
.amdhsa_next_free_sgpr 32
-// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs
+// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs
.end_amdhsa_kernel
.amdhsa_kernel implied_count_too_low_2
@@ -25,7 +25,7 @@
.amdhsa_accum_offset 4
.amdhsa_next_free_vgpr 32
.amdhsa_next_free_sgpr 32
-// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs
+// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs
.end_amdhsa_kernel
.amdhsa_kernel preload_out_of_bounds_0
diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s b/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s
new file mode 100644
index 000000000000..41eae10ccd18
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=asm %s 2>&1 | FileCheck %s
+
+.amdhsa_code_object_version 6
+
+// CHECK:.amdhsa_user_sgpr_count 8
+ .amdhsa_kernel user_sgpr_0
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+
+ .amdhsa_user_sgpr_count 8
+.end_amdhsa_kernel
+
+// CHECK:.amdhsa_user_sgpr_count 31
+.amdhsa_kernel user_sgpr_1
+ .amdhsa_next_free_vgpr 1
+ .amdhsa_next_free_sgpr 0
+
+ .amdhsa_user_sgpr_count 31
+.end_amdhsa_kernel
+
+// CHECK:.amdhsa_user_sgpr_count 32
+.amdhsa_kernel user_sgpr_2
+ .amdhsa_next_free_vgpr 1
+ .amdhsa_next_free_sgpr 0
+
+ .amdhsa_user_sgpr_count 32
+ .end_amdhsa_kernel