diff options
| author | Ruiling Song <ruiling.song@amd.com> | 2026-02-06 14:06:27 +0800 |
|---|---|---|
| committer | Ruiling Song <ruiling.song@amd.com> | 2026-02-06 14:06:27 +0800 |
| commit | 22fb226a645e574acfce7def4abfea6096979eab (patch) | |
| tree | f2a7afd88a6e5a414e9282ba64cb938d84461035 | |
| parent | 198baf1b8fbda92d0d42c7abe6fa21c8247c029d (diff) | |
| download | llvm-users/ruiling/add-test.zip llvm-users/ruiling/add-test.tar.gz llvm-users/ruiling/add-test.tar.bz2 | |
[AMDGPU] Add more tests users/ruiling/add-test
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll | 111 |
1 files changed, 69 insertions, 42 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll index a1e229d0..21ca254 100644 --- a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll @@ -1,48 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-GISEL %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s -define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inreg %sbase) { -; %voffset = load i32, ptr addrspace(1) %voffset.ptr -; GFX12-SDAG-LABEL: global_load_saddr_offset_imm: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: ; return to shader part epilog +; The address calculation can be simplified and folded because of known bits of mbcnt. 
+define amdgpu_ps <2 x float> @global_load_scale_add_foldable_knownbits(ptr addrspace(1) inreg %sbase) { +; GFX12-LABEL: global_load_scale_add_foldable_knownbits: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog ; -; GFX1250-SDAG-LABEL: global_load_saddr_offset_imm: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_offset_imm: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: global_load_saddr_offset_imm: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_scale_add_foldable_knownbits: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %v = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %mul = shl i32 %v, 1 %add = add i32 %mul, 32 @@ -52,5 +33,51 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre ret <2 x float> %load } +; The nsw/nuw helps the folding of address addition. +define amdgpu_ps <2 x float> @global_load_scale_add_foldable_nowrap(ptr addrspace(1) inreg %sbase, i32 %v) { +; GFX12-LABEL: global_load_scale_add_foldable_nowrap: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: global_load_scale_add_foldable_nowrap: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %mul = shl nsw nuw i32 %v, 3 + %add = add nsw nuw i32 %mul, 128 + %zext.offset = zext i32 %add to i64 + %gep = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset + %load = load <2 x float>, ptr addrspace(1) %gep + ret <2 x float> %load +} + +; Address calculation cannot be folded because possible overflow during addition. 
+define amdgpu_ps <2 x float> @global_load_scale_add_unfoldable(ptr addrspace(1) inreg %sbase, i32 %v) { +; GFX12-LABEL: global_load_scale_add_unfoldable: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: global_load_scale_add_unfoldable: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %mul = shl i32 %v, 3 + %add = add i32 %mul, 128 + %zext.offset = zext i32 %add to i64 + %gep = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset + %load = load <2 x float>, ptr addrspace(1) %gep + ret <2 x float> %load +} declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) |
