aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/packed-fp32.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll203
1 files changed, 103 insertions, 100 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 78207c2..1177474 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
@@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
@@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
@@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
@@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
-; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
-; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
+; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
+; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
@@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
@@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
@@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]