; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s

; This test checks memory addresses with constant offset components that should
; not be folded into memory accesses with immediate offsets.
; SeparateConstOffsetsFromGEP transforms the GEPs in a way that can lead to
; out-of-bounds or negative intermediate results in the address computation,
; which are problematic for flat and scratch instructions:
;   gep[inbounds](p, i + 3) -> gep(gep(p, i), 3)

; FIXME: The offset here should not be folded: if %p points to the beginning of
; scratch or LDS and %i is -1, the intermediate address gep(p, -1) lies before
; the start of that memory, and a folded offset crashes the program.
; Loads element (%i + 3) of an i32 array through a flat pointer. Every set of
; checks below currently expects the constant part of the index (3 elements,
; i.e. 12 bytes) to appear as an immediate `offset:12` on the flat load.
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX90A-LABEL: flat_offset_maybe_oob:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX90A-NEXT:    flat_load_dword v0, v[0:1] offset:12
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_offset_maybe_oob:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX10-NEXT:    flat_load_dword v0, v[0:1] offset:12
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_offset_maybe_oob:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-NEXT:    flat_load_dword v0, v[0:1] offset:12
; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_offset_maybe_oob:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT:    flat_load_b32 v0, v[0:1] offset:12
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_offset_maybe_oob:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT:    s_wait_alu 0xfffd
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT:    flat_load_b32 v0, v[0:1] offset:12
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  ; The index is a variable part (%i) plus a constant part (3).
  %idx = add nsw i32 %i, 3
  %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
  %l = load i32, ptr %arrayidx
  ret i32 %l
}

; For MUBUF and for GFX12, folding the offset is okay.
; Same access pattern as above, but through a private (addrspace(5)) pointer.
; The checks for the two MUBUF runs and for gfx1200 expect `offset:12` on the
; load itself. The checks for the flat-scratch runs on gfx90a and gfx1030, and
; for gfx942 and gfx1100, instead expect the constant 12 to be added to the
; address with v_add3_u32 and the scratch load to carry no immediate offset.
define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
; GFX90A-MUBUF-LABEL: private_offset_maybe_oob:
; GFX90A-MUBUF:       ; %bb.0:
; GFX90A-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-MUBUF-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
; GFX90A-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX90A-MUBUF-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX90A-FLATSCR:       ; %bb.0:
; GFX90A-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX90A-FLATSCR-NEXT:    v_add3_u32 v0, v0, v1, 12
; GFX90A-FLATSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX90A-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-MUBUF-LABEL: private_offset_maybe_oob:
; GFX10-MUBUF:       ; %bb.0:
; GFX10-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-MUBUF-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
; GFX10-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX10-MUBUF-NEXT:    s_waitcnt vmcnt(0)
; GFX10-MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX10-FLATSCR:       ; %bb.0:
; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-FLATSCR-NEXT:    v_add3_u32 v0, v0, v1, 12
; GFX10-FLATSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: private_offset_maybe_oob:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX942-NEXT:    v_add3_u32 v0, v0, v1, 12
; GFX942-NEXT:    scratch_load_dword v0, v0, off
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_offset_maybe_oob:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, v0, v1, 12
; GFX11-NEXT:    scratch_load_b32 v0, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: private_offset_maybe_oob:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_lshl_add_u32 v0, v1, 2, v0
; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:12
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  ; The index is a variable part (%i) plus a constant part (3).
  %idx = add nsw i32 %i, 3
  %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx
  %l = load i32, ptr addrspace(5) %arrayidx
  ret i32 %l
}