aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/packed-fp32.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll444
1 files changed, 222 insertions, 222 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index b0651ef..78207c2 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -340,46 +340,46 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
@@ -395,58 +395,58 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43]
@@ -466,14 +466,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -1597,46 +1597,46 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
@@ -1652,58 +1652,58 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43]
@@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -2428,46 +2428,46 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fma_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[44:45]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[46:47]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
@@ -2482,58 +2482,58 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
-; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43]
@@ -2553,14 +2553,14 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -3529,9 +3529,9 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
;
; GFX1250-SDAG-LABEL: fadd_fadd_fsub:
; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3
@@ -3541,14 +3541,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -3560,7 +3560,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[6:7]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1