diff options
| author | Zeng Wu <zengwu13@amd.com> | 2026-04-28 21:25:29 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-04-28 21:25:29 -0700 |
| commit | 19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11 (patch) | |
| tree | c71dc2c1a8c95b767e6bef0c044a6fd1205b3c27 | |
| parent | e42c2fef2c2add3bd661149c89c9ba4b581bf76a (diff) | |
| download | llvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.tar.gz llvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.tar.bz2 llvm-19a3d7b5db5ccb8b544e8ba2ffdb1d4a528e1b11.zip | |
[AMDGPU][MC] update USER_SGPR_COUNT bits for GFX1250 (#192579)
When compiling a Triton kernel that uses a tensor descriptor created on
the host side, we hit the error message `amdgpu_user_sgpr_count smaller
than than implied by enabled user SGPRs`.
After some debugging, we found that the `USER_SGPR_COUNT` field was not
updated for GFX125. This patch updates it according to the table at
https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx12-table.
On GFX125, COMPUTE_PGM_RSRC2::USER_SGPR_COUNT is 6 bits wide. The MC
helper S_00B84C_USER_SGPR only masks to 5 bits; when the true user SGPR
count is 32 or more, the masked value wraps (e.g. 32 -> 0).
`AMDGPUAsmPrinter` then emits `.amdhsa_user_sgpr_count 0`, which
disagrees with the implied count from enabled user SGPRs (including
kernarg preload), so assembling the llc output with `llvm-mc` fails in
`AMDGPUAsmParser`.
---------
Co-authored-by: Shilei Tian <i@tianshilei.me>
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 11 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIDefines.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 40 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIProgramInfo.h | 6 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll | 21 | ||||
| -rw-r--r-- | llvm/test/MC/AMDGPU/user-sgpr-count-diag.s | 6 | ||||
| -rw-r--r-- | llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s | 27 |
8 files changed, 92 insertions, 25 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 94ff6c2daf33..a788c1384821 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -678,7 +678,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); - KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); + KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(STM, Ctx); KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); int64_t PGM_Rsrc3 = 1; @@ -1392,7 +1392,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI( /*Size=*/4); OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); - EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4); + EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), + /*Size=*/4); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); @@ -1518,7 +1519,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata( if (MD->getPALMajorVersion() < 3) { MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx); if (AMDGPU::isCompute(CC)) { - MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); + MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), Ctx); } else { const MCExpr *HasScratchBlocks = MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks, @@ -1600,7 +1601,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { CallingConv::AMDGPU_CS, CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx); MD->setRsrc2(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); + CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx); } else { EmitPALMetadataCommon( MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST, @@ -1643,7 +1644,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, 
Out.compute_pgm_resource1_registers = CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx); Out.compute_pgm_resource2_registers = - CurrentProgramInfo.getComputePGMRSrc2(Ctx); + CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx); Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f19ac78abbd7..7b5738845ef2 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6404,7 +6404,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext()); if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount) - return TokError("amdgpu_user_sgpr_count smaller than than implied by " + return TokError("amdgpu_user_sgpr_count smaller than implied by " "enabled user SGPRs"); if (isGFX1250Plus()) { diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 5612b0df7aa9..9867f5b02d7a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1211,6 +1211,10 @@ enum { #define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) #define C_00B84C_EXCP_EN 0x80FFFFFF +#define S_00B84C_USER_SGPR_GFX1250(x) (((x) & 0x3F) << 1) +#define G_00B84C_USER_SGPR_GFX1250(x) (((x) >> 1) & 0x3F) +#define C_00B84C_USER_SGPR_GFX1250 0xFFFFFF81 + #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC #define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index a3f261b87e80..471e2f7a3857 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -136,17 +136,27 @@ static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo, return Reg; } -static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) { - 
uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) | - S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) | - S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) | - S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) | - S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) | - S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) | - S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) | - S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) | - S_00B84C_LDS_SIZE(ProgInfo.LdsSize) | - S_00B84C_EXCP_EN(ProgInfo.EXCPEnable); +static uint64_t getComputePGMRSrc2Reg(const GCNSubtarget &ST, + const SIProgramInfo &ProgInfo) { + uint64_t MaxNumerUserSGRPs = AMDGPU::getMaxNumUserSGPRs(ST); + uint64_t Reg = 0; + if (MaxNumerUserSGRPs == 32) { + Reg = S_00B84C_USER_SGPR_GFX1250(ProgInfo.UserSGPR); + } else if (MaxNumerUserSGRPs == 16) { + Reg = (S_00B84C_USER_SGPR(ProgInfo.UserSGPR) | + S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable)); + } else { + llvm_unreachable("max Number of User SGPRs are either 32 or 16"); + } + + Reg |= S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) | + S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) | + S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) | + S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) | + S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) | + S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) | + S_00B84C_LDS_SIZE(ProgInfo.LdsSize) | + S_00B84C_EXCP_EN(ProgInfo.EXCPEnable); return Reg; } @@ -189,16 +199,18 @@ const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, return MCBinaryExpr::createOr(RegExpr, Res, Ctx); } -const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const { - uint64_t Reg = getComputePGMRSrc2Reg(*this); +const MCExpr *SIProgramInfo::getComputePGMRSrc2(const GCNSubtarget &ST, + MCContext &Ctx) const { + uint64_t Reg = getComputePGMRSrc2Reg(ST, *this); const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx); } const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, + const GCNSubtarget &ST, MCContext &Ctx) const { if 
(AMDGPU::isCompute(CC)) - return getComputePGMRSrc2(Ctx); + return getComputePGMRSrc2(ST, Ctx); return MCConstantExpr::create(0, Ctx); } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 171c4a313a53..947b473142a1 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -117,8 +117,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { MCContext &Ctx) const; /// Compute the value of the ComputePGMRsrc2 register. - const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const; - const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const; + const MCExpr *getComputePGMRSrc2(const GCNSubtarget &ST, + MCContext &Ctx) const; + const MCExpr *getPGMRSrc2(CallingConv::ID CC, const GCNSubtarget &ST, + MCContext &Ctx) const; }; } // namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index 4054d4c99c2b..933f81c27934 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -1,5 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=GFX1250-OBJDUMP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250-ASM %s ; OBJDUMP: Contents of section .rodata: ; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................ 
@@ -70,4 +72,23 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void } +; GFX1250-OBJDUMP: 0100 00000000 00000000 90010000 00000000 ................ +; GFX1250-OBJDUMP: 0110 00000000 00000000 00000000 00000000 ................ +; GFX1250-OBJDUMP: 0120 00000000 00000000 00000000 10000000 ................ +; GFX1250-OBJDUMP: 0130 00000fc0 c0130000 1e041800 00000000 ................ +; GFX1250-ASM: .sgpr_count: 32 +define amdgpu_kernel void @many__i32( + i32 inreg %a0, i32 inreg %a1, i32 inreg %a2, i32 inreg %a3, + i32 inreg %a4, i32 inreg %a5, i32 inreg %a6, i32 inreg %a7, + i32 inreg %a8, i32 inreg %a9, i32 inreg %a10, i32 inreg %a11, + i32 inreg %a12, i32 inreg %a13, i32 inreg %a14, i32 inreg %a15, + i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19, + i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23, + i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27, + i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31, + i32 inreg %a32, i32 inreg %a33, i32 inreg %a34, i32 inreg %a35) { + ret void +} + + attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s index aad219bc77d4..59e0cb93e289 100644 --- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s @@ -6,7 +6,7 @@ .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 
.amdhsa_next_free_sgpr 32 -// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs +// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs .end_amdhsa_kernel .amdhsa_kernel implied_count_too_low_1 @@ -15,7 +15,7 @@ .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 -// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs +// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs .end_amdhsa_kernel .amdhsa_kernel implied_count_too_low_2 @@ -25,7 +25,7 @@ .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 -// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs +// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than implied by enabled user SGPRs .end_amdhsa_kernel .amdhsa_kernel preload_out_of_bounds_0 diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s b/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s new file mode 100644 index 000000000000..41eae10ccd18 --- /dev/null +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-gfx1250.s @@ -0,0 +1,27 @@ +// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=asm %s 2>&1 | FileCheck %s + +.amdhsa_code_object_version 6 + +// CHECK:.amdhsa_user_sgpr_count 8 + .amdhsa_kernel user_sgpr_0 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + + .amdhsa_user_sgpr_count 8 +.end_amdhsa_kernel + +// CHECK:.amdhsa_user_sgpr_count 31 +.amdhsa_kernel user_sgpr_1 + .amdhsa_next_free_vgpr 1 + .amdhsa_next_free_sgpr 0 + + .amdhsa_user_sgpr_count 31 +.end_amdhsa_kernel + +// CHECK:.amdhsa_user_sgpr_count 32 +.amdhsa_kernel user_sgpr_2 + .amdhsa_next_free_vgpr 1 + .amdhsa_next_free_sgpr 0 + + .amdhsa_user_sgpr_count 32 + .end_amdhsa_kernel |
