diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/scratch-simple.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 2346 |
1 files changed, 1180 insertions, 1166 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8..fb9c477 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,43 +28,38 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -74,49 +69,40 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -125,6 +111,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -139,6 +138,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -153,84 +153,70 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -238,6 +224,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -252,6 +251,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -266,83 +266,69 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -350,6 +336,18 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -364,6 +362,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -414,7 +413,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -422,24 +420,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -447,9 +444,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -457,6 +453,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -475,6 +474,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -503,7 +504,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -525,7 +525,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -533,24 +532,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -558,9 +556,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -568,6 +565,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -586,6 +586,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -974,42 +976,43 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -1024,8 +1027,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -1051,43 +1053,38 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -1097,49 +1094,40 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -1148,6 +1136,19 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -1162,6 +1163,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1176,84 +1178,70 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -1261,6 +1249,19 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -1275,6 +1276,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1289,83 +1291,69 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -1373,6 +1361,18 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -1387,6 +1387,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1415,7 +1417,6 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -1437,7 +1438,6 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -1445,24 +1445,23 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -1470,9 +1469,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -1480,6 +1478,9 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -1498,6 +1499,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1526,7 +1529,6 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -1548,7 +1550,6 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -1556,24 +1557,23 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -1581,9 +1581,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -1591,6 +1590,9 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -1609,6 +1611,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -1997,42 +2001,43 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -2047,8 +2052,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -2074,43 +2078,38 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -2120,49 +2119,40 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -2171,6 +2161,19 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -2185,6 +2188,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2199,84 +2203,70 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -2284,6 +2274,19 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -2298,6 +2301,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2312,83 +2316,69 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -2396,6 +2386,18 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -2410,6 +2412,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2438,7 +2442,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2460,7 +2463,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2468,24 +2470,23 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2493,9 +2494,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2503,6 +2503,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2521,6 +2524,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2549,7 +2554,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2571,7 +2575,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2579,24 +2582,23 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2604,9 +2606,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2614,6 +2615,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2632,6 +2636,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3020,42 +3026,43 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -3070,8 +3077,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -3097,43 +3103,38 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -3143,49 +3144,40 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3194,6 +3186,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3208,6 +3213,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3222,84 +3228,70 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3307,6 +3299,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3321,6 +3326,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3334,83 +3340,69 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -3418,6 +3410,18 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -3432,6 +3436,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3459,7 +3465,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3481,7 +3486,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3489,24 +3493,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3514,9 +3517,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3524,6 +3526,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3542,6 +3547,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3569,7 +3576,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3591,7 +3597,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3599,24 +3604,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3624,9 +3628,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3634,6 +3637,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3652,6 +3658,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4040,42 +4048,43 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4090,8 +4099,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4117,43 +4125,38 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4163,49 +4166,40 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4214,6 +4208,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4228,6 +4235,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4242,84 +4250,70 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4327,6 +4321,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4341,6 +4348,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4354,83 +4362,69 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4438,6 +4432,18 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -4452,6 +4458,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4479,7 +4487,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4501,7 +4508,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4509,24 +4515,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4534,9 +4539,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4544,6 +4548,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4562,6 +4569,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4589,7 +4598,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4611,7 +4619,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4619,24 +4626,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4644,9 +4650,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4654,6 +4659,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4672,6 +4680,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -5060,42 +5070,43 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5110,8 +5121,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5141,43 +5151,38 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5187,49 +5192,40 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5238,6 +5234,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5252,6 +5261,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5267,84 +5277,70 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5352,6 +5348,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5366,6 +5375,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5380,83 +5390,69 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5464,6 +5460,18 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5478,8 +5486,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -5491,10 +5500,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5505,8 +5514,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5528,7 +5535,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5536,24 +5542,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5561,9 +5568,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5571,6 +5577,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5589,6 +5597,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -5602,10 +5612,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5616,8 +5626,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5639,7 +5647,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5647,24 +5654,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5672,9 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5682,6 +5689,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5700,6 +5709,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6093,10 +6104,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -6105,29 +6116,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -6142,8 +6155,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -6172,43 +6184,38 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6218,49 +6225,40 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6269,6 +6267,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6283,6 +6294,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6298,84 +6310,70 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6383,6 +6381,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6397,6 +6408,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6411,83 +6423,69 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6495,6 +6493,18 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6509,8 +6519,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -6522,10 +6533,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6536,8 +6547,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6559,7 +6568,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6567,24 +6575,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6592,9 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6602,6 +6610,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6620,6 +6630,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6633,10 +6645,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6647,8 +6659,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6670,7 +6680,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6678,24 +6687,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6703,9 +6713,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6713,6 +6722,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6731,6 +6742,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -7124,10 +7137,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -7136,29 +7149,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -7173,8 +7188,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 |
