aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/scratch-simple.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-simple.ll1728
1 files changed, 865 insertions, 863 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 7a3bff8..840916a 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -28,15 +28,20 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -44,26 +49,19 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -76,19 +74,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -97,8 +84,22 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -106,17 +107,16 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -153,37 +153,35 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -193,19 +191,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -213,24 +200,37 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -266,36 +266,33 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -305,26 +302,30 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -333,16 +334,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -435,6 +434,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -448,8 +448,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -457,6 +455,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -503,7 +503,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -546,6 +545,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -559,8 +559,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -568,6 +566,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -974,42 +974,43 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -1024,8 +1025,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -1051,15 +1051,20 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -1067,26 +1072,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1099,19 +1097,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1120,8 +1107,22 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -1129,17 +1130,16 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1176,37 +1176,35 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1216,19 +1214,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1236,24 +1223,37 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1289,36 +1289,33 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1328,26 +1325,30 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -1356,16 +1357,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1415,7 +1415,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1458,6 +1457,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1471,8 +1471,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1526,7 +1526,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1569,6 +1568,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1582,8 +1582,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1589,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1997,42 +1997,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2048,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,15 +2074,20 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2090,26 +2095,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -2122,19 +2120,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2143,8 +2130,22 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -2152,17 +2153,16 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2199,37 +2199,35 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2239,19 +2237,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2259,24 +2246,37 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2312,36 +2312,33 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2351,26 +2348,30 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -2379,16 +2380,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2438,7 +2438,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2481,6 +2480,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2494,8 +2494,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2501,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2549,7 +2549,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2592,6 +2591,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2605,8 +2605,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2612,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3020,42 +3020,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -3070,8 +3071,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -3097,15 +3097,20 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -3113,26 +3118,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3145,19 +3143,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3166,8 +3153,22 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -3175,17 +3176,16 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3222,37 +3222,35 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3262,19 +3260,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3282,24 +3269,37 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3334,36 +3334,33 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3373,26 +3370,30 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -3401,16 +3402,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -3459,7 +3459,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3502,6 +3501,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3515,8 +3515,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3524,6 +3522,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3569,7 +3569,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3612,6 +3611,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3625,8 +3625,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3634,6 +3632,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4040,42 +4040,43 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -4090,8 +4091,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -4117,15 +4117,20 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -4133,26 +4138,19 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -4165,19 +4163,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -4186,8 +4173,22 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -4195,17 +4196,16 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4242,37 +4242,35 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4282,19 +4280,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -4302,24 +4289,37 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4354,36 +4354,33 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4393,26 +4390,30 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -4421,16 +4422,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -4479,7 +4479,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4522,6 +4521,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -4535,8 +4535,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4544,6 +4542,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4589,7 +4589,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4632,6 +4631,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -4645,8 +4645,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4654,6 +4652,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -5060,42 +5060,43 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -5110,8 +5111,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -5141,15 +5141,20 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -5157,26 +5162,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -5189,19 +5187,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -5210,8 +5197,22 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
@@ -5219,17 +5220,16 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5267,37 +5267,35 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5307,19 +5305,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -5327,24 +5314,37 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5380,36 +5380,33 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5419,26 +5416,30 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -5447,16 +5448,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5491,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5505,8 +5505,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5549,6 +5547,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -5562,8 +5562,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5571,6 +5569,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5602,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5616,8 +5616,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5660,6 +5658,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -5673,8 +5673,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5682,6 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6093,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -6105,29 +6105,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -6142,8 +6144,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -6172,15 +6173,20 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -6188,26 +6194,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -6220,19 +6219,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -6241,8 +6229,22 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
@@ -6250,17 +6252,16 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6298,37 +6299,35 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6338,19 +6337,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -6358,24 +6346,37 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6411,36 +6412,33 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6450,26 +6448,30 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -6478,16 +6480,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6522,10 +6523,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6536,8 +6537,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6580,6 +6579,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -6593,8 +6594,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6602,6 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6633,10 +6634,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6647,8 +6648,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6691,6 +6690,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -6704,8 +6705,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6713,6 +6712,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -7124,10 +7125,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -7136,29 +7137,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -7173,8 +7176,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752