aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
author: Changpeng Fang <changpeng.fang@amd.com> 2025-08-06 11:47:37 -0700
committer: GitHub <noreply@github.com> 2025-08-06 11:47:37 -0700
commit: 32161e9de36a747dde22a06c1c99a6091eb2920b (patch)
tree: e880069ffc6685622c5bcdab8da7c20d6527c459 /llvm/test
parent: 35bd40d321ccb2e646c112418ef32318dd0e040b (diff)
download: llvm-main.zip
llvm-main.tar.gz
llvm-main.tar.bz2
[AMDGPU] Do not fold an immediate into instructions with frame indexes (#151263) (HEAD, main)

Do not fold an immediate into an instruction that already has a frame index operand. A frame index could possibly turn out to be another immediate.

Fixes: SWDEV-536263

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 60
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 31
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir | 3
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir | 3
-rw-r--r-- llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll | 3
-rw-r--r-- llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll | 26
-rw-r--r-- llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll | 27
-rw-r--r-- llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll | 108
8 files changed, 215 insertions, 46 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a066b15..e6a8bac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX942-LABEL: store_load_large_imm_offset_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX942-LABEL: store_load_large_imm_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX9: ; %bb.0: ; %bb
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX10: ; %bb.0: ; %bb
; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b2..fc88839 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3804
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0
+; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 7fad2f4..a88b1ec 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -75,7 +75,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0
- ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
index cc43142..2f2d727 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
@@ -46,7 +46,8 @@ body: |
%2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc
...
# GCN-LABEL: name: test_frameindex{{$}}
-# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70
+# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70
+# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]]
---
name: test_frameindex
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 15cda62..f2fe61f 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -360,7 +360,8 @@ entry:
; s_add_i32.
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
-; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014..3001248 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
-; CHECK-NEXT: s_add_i32 s1, s32, 0xf4
-; CHECK-NEXT: s_add_i32 s2, s32, 0xf8
-; CHECK-NEXT: s_add_i32 s3, s32, 0xfc
+; CHECK-NEXT: s_movk_i32 s1, 0xf4
+; CHECK-NEXT: s_movk_i32 s2, 0xf8
+; CHECK-NEXT: s_movk_i32 s3, 0xfc
+; CHECK-NEXT: s_movk_i32 s34, 0x100
; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: s_add_i32 s34, s32, 0x100
-; CHECK-NEXT: s_add_i32 s35, s32, 0x104
-; CHECK-NEXT: s_add_i32 s36, s32, 0x108
-; CHECK-NEXT: s_add_i32 s37, s32, 0x110
-; CHECK-NEXT: s_add_i32 s38, s32, 0x120
+; CHECK-NEXT: s_movk_i32 s35, 0x104
+; CHECK-NEXT: s_movk_i32 s36, 0x108
+; CHECK-NEXT: s_movk_i32 s37, 0x110
+; CHECK-NEXT: s_movk_i32 s38, 0x120
+; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
+; CHECK-NEXT: s_add_i32 s1, s32, s1
+; CHECK-NEXT: s_add_i32 s2, s32, s2
+; CHECK-NEXT: s_add_i32 s3, s32, s3
+; CHECK-NEXT: s_add_i32 s34, s32, s34
+; CHECK-NEXT: s_add_i32 s35, s32, s35
+; CHECK-NEXT: s_add_i32 s36, s32, s36
+; CHECK-NEXT: s_add_i32 s37, s32, s37
+; CHECK-NEXT: s_add_i32 s38, s32, s38
; CHECK-NEXT: s_or_b32 s39, s32, 4
; CHECK-NEXT: s_or_b32 s40, s32, 8
; CHECK-NEXT: s_or_b32 s41, s32, 12
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a3ebaec..5f0ca7b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -74,7 +74,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
@@ -175,7 +176,9 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_i32 s1, s33, s0
+; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
@@ -223,30 +226,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT: ; %bb.2: ; %split
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_movk_i32 s4, 0x4000
; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12c4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s4
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12cc
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12c8
; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6
; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
@@ -298,7 +306,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
new file mode 100644
index 0000000..6d0aa1e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %val4, <16 x i64> %val16) {
+; CHECK-LABEL: no_folding_imm_to_inst_with_fi:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: s_load_b256 s[36:43], s[4:5], 0x24
+; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
+; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
+; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
+; CHECK-NEXT: s_movk_i32 s33, 0x70
+; CHECK-NEXT: s_movk_i32 s34, 0x60
+; CHECK-NEXT: s_or_b32 s44, 0x80, s33
+; CHECK-NEXT: s_mov_b32 s45, s35
+; CHECK-NEXT: s_or_b32 s46, 0x80, s34
+; CHECK-NEXT: s_mov_b32 s47, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
+; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
+; CHECK-NEXT: s_movk_i32 s34, 0x80
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
+; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
+; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
+; CHECK-NEXT: s_movk_i32 s20, 0x50
+; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_or_b32 s20, 0x80, s20
+; CHECK-NEXT: s_mov_b32 s21, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; CHECK-NEXT: s_or_b32 s16, 0x80, 64
+; CHECK-NEXT: s_mov_b32 s17, s35
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; CHECK-NEXT: s_or_b32 s12, 0x80, 48
+; CHECK-NEXT: s_mov_b32 s13, s35
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; CHECK-NEXT: s_or_b32 s8, 0x80, 32
+; CHECK-NEXT: s_mov_b32 s9, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; CHECK-NEXT: s_or_b32 s4, 0x80, 16
+; CHECK-NEXT: s_mov_b32 s5, s35
+; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
+; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
+; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_endpgm
+bb:
+ %alloca = alloca <4 x i64>, align 32, addrspace(5)
+ %alloca1 = alloca <16 x i64>, align 128, addrspace(5)
+ store volatile <4 x i64> %val4, ptr addrspace(5) %alloca
+ %ascast = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ store volatile <16 x i64> %val16, ptr %ascast
+ %load = load volatile <16 x i64>, ptr %ascast
+ ret void
+}