diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2025-07-25 19:03:52 +0900 |
---|---|---|
committer | Matt Arsenault <arsenm2@gmail.com> | 2025-09-18 11:06:21 +0900 |
commit | 9be83ee93251a08500b826cb87555692b8886dd7 (patch) | |
tree | 198e39686db16bd959c8a4cb61c8b7b95a1cdfd3 | |
parent | 8c778b6dbe34f5db5f28730653d81aabc18430fd (diff) | |
download | llvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.zip llvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.tar.gz llvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.tar.bz2 |
AMDGPU: Select VGPR MFMAs by default (branch: users/arsenm/amdgpu/select-vgpr-mfma-by-default)
AGPRs are undesirable since they are only usable by a
handful of instructions, like loads, stores and MFMAs, and everything
else requires copies to/from VGPRs. Using the AGPR form should be
a measure of last resort, for when we must use more than 256 VGPRs.
28 files changed, 11169 insertions, 10914 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 908d856..0077c69 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -37,7 +37,7 @@ static cl::opt<bool> MFMAVGPRForm( "amdgpu-mfma-vgpr-form", cl::Hidden, cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " "unspecified, default to compiler heuristics"), - cl::init(false)); + cl::init(true)); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 5720b88..2493065 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -15,59 +15,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[36:37], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s38, 2 -; GCN-NEXT: s_mov_b32 s39, s37 +; GCN-NEXT: v_pk_mov_b32 v[32:33], s[36:37], s[36:37] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s36, 2 +; GCN-NEXT: v_pk_mov_b32 v[34:35], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 ; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a16, s16 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: 
v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_accvgpr_write_b32 a8, s8 -; GCN-NEXT: v_accvgpr_write_b32 a9, s9 -; GCN-NEXT: v_accvgpr_write_b32 a10, s10 -; GCN-NEXT: v_accvgpr_write_b32 a11, s11 -; GCN-NEXT: v_accvgpr_write_b32 a12, s12 -; GCN-NEXT: v_accvgpr_write_b32 a13, s13 -; GCN-NEXT: v_accvgpr_write_b32 a14, s14 -; GCN-NEXT: v_accvgpr_write_b32 a15, s15 -; GCN-NEXT: v_accvgpr_write_b32 a17, s17 -; GCN-NEXT: v_accvgpr_write_b32 a18, s18 -; GCN-NEXT: v_accvgpr_write_b32 a19, s19 -; GCN-NEXT: v_accvgpr_write_b32 a20, s20 -; GCN-NEXT: v_accvgpr_write_b32 a21, s21 -; GCN-NEXT: v_accvgpr_write_b32 a22, s22 -; GCN-NEXT: v_accvgpr_write_b32 a23, s23 -; GCN-NEXT: v_accvgpr_write_b32 a24, s24 -; GCN-NEXT: v_accvgpr_write_b32 a25, s25 -; GCN-NEXT: v_accvgpr_write_b32 a26, s26 -; GCN-NEXT: v_accvgpr_write_b32 a27, s27 -; GCN-NEXT: v_accvgpr_write_b32 a28, s28 -; GCN-NEXT: v_accvgpr_write_b32 a29, s29 -; GCN-NEXT: v_accvgpr_write_b32 a30, s30 -; GCN-NEXT: v_accvgpr_write_b32 a31, s31 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1] +; GCN-NEXT: 
v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[32:33], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GCN-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GCN-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GCN-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 ; GCN-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -83,36 +66,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] 
op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_accvgpr_write_b32 a8, s8 -; GCN-NEXT: v_accvgpr_write_b32 a9, s9 -; GCN-NEXT: v_accvgpr_write_b32 a10, s10 -; GCN-NEXT: v_accvgpr_write_b32 a11, s11 -; GCN-NEXT: v_accvgpr_write_b32 a12, s12 -; GCN-NEXT: v_accvgpr_write_b32 a13, s13 -; GCN-NEXT: v_accvgpr_write_b32 a14, s14 -; GCN-NEXT: v_accvgpr_write_b32 a15, s15 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 9 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GCN-NEXT: 
global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GCN-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -128,21 +103,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s4, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -158,37 +131,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 
v[18:19], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_accvgpr_write_b32 a8, s8 -; GCN-NEXT: v_accvgpr_write_b32 a9, s9 -; GCN-NEXT: v_accvgpr_write_b32 a10, s10 -; GCN-NEXT: v_accvgpr_write_b32 a11, s11 -; GCN-NEXT: v_accvgpr_write_b32 a12, s12 -; GCN-NEXT: v_accvgpr_write_b32 a13, s13 -; GCN-NEXT: v_accvgpr_write_b32 a14, s14 -; GCN-NEXT: v_accvgpr_write_b32 a15, s15 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], 
s[16:17] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GCN-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -204,21 +169,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s4, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_nop 9 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -238,12 +201,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 +; GCN-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], 
v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) @@ -258,25 +221,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: v_accvgpr_write_b32 a7, s7 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] -; GCN-NEXT: global_store_dwordx4 
v0, a[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x double>, ptr addrspace(1) %arg @@ -291,16 +250,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 -; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0 +; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0) @@ -312,28 +271,26 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 +; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: 
s_mov_b64 s[6:7], 1.0 -; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 +; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_accvgpr_write_b32 a2, s8 -; GCN-NEXT: v_accvgpr_write_b32 a4, s8 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a3, s9 -; GCN-NEXT: v_accvgpr_write_b32 a5, s9 -; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0) @@ -344,28 +301,27 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; 
GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0x405ec000 -; GCN-NEXT: v_accvgpr_write_b32 a0, s6 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, 0x405ec000 +; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_accvgpr_write_b32 a2, s6 -; GCN-NEXT: v_accvgpr_write_b32 a4, s6 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 -; GCN-NEXT: v_accvgpr_write_b32 a1, s7 -; GCN-NEXT: v_accvgpr_write_b32 a3, s7 -; GCN-NEXT: v_accvgpr_write_b32 a5, s7 -; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: s_mov_b64 s[6:7], s[0:1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir index 67ed51a3..5f040779 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY --- name: mfma_f32_32x32x4bf16_1k_vva diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir index e11586e..12208c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST -# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST +# RUN: llc -mtriple=amdgcn 
-mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY --- name: mfma_i32_16x16x32_i8_vva diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll index 635d2a2..4258d1d 100644 --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 9e24023..f2f41f4 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-mfma-vgpr-form=0 -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; This testcase would fail on GFX908 due to not having a free VGPR available to ; copy between AGPRs. 
diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll index fe8edd5..7644d89 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s -; Make sure flag is ignored -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=1 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s +; Make sure flag is ignored for gfx908 +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s ; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding: ; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll index 80f295b..51bcb39 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll @@ -6,146 +6,144 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s6, s[4:5], 0x10 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v5, 0x3fb8aa3b +; GCN-NEXT: v_mov_b32_e32 v4, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0x42b17218 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, s6, v1 -; GCN-NEXT: v_rndne_f32_e32 v3, v2 -; GCN-NEXT: v_sub_f32_e32 v4, v2, v3 -; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2 -; 
GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_add_f32_e32 v1, v4, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: v_mul_f32_e32 v6, s6, v5 +; GCN-NEXT: v_rndne_f32_e32 v7, v6 +; GCN-NEXT: v_sub_f32_e32 v8, v6, v7 +; GCN-NEXT: v_fma_f32 v5, s6, v5, -v6 +; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_fmac_f32_e32 v5, s6, v6 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_add_f32_e32 v5, v8, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v6, v7 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_exp_f32_e32 v5, v5 ; GCN-NEXT: s_mov_b32 s0, 0x3fb8aa3b -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] ; GCN-NEXT: ; iglp_opt mask(0x00000003) -; GCN-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2 -; GCN-NEXT: v_mov_b32_e32 v2, 0x42b17218 +; GCN-NEXT: v_ldexp_f32 v5, v5, v6 +; GCN-NEXT: v_mov_b32_e32 v6, 0xc2ce8ed0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v6 +; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v2 -; GCN-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 +; GCN-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GCN-NEXT: s_mov_b32 s6, 0xc2ce8ed0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 -; 
GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_ldexp_f32 v3, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_ldexp_f32 v7, v7, v8 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 -; GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_ldexp_f32 v3, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, 
v[0:3] +; GCN-NEXT: v_ldexp_f32 v7, v7, v8 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 -; GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_ldexp_f32 v3, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_ldexp_f32 v7, v7, v8 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 -; GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_ldexp_f32 v3, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; 
GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_ldexp_f32 v7, v7, v8 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 -; GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_ldexp_f32 v3, v3, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_ldexp_f32 v7, v7, v8 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 -; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v3 -; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 -; GCN-NEXT: 
v_sub_f32_e32 v3, v3, v5 -; GCN-NEXT: v_add_f32_e32 v3, v3, v4 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 -; GCN-NEXT: v_ldexp_f32 v0, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5 +; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7 +; GCN-NEXT: v_rndne_f32_e32 v9, v7 +; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5 +; GCN-NEXT: v_sub_f32_e32 v7, v7, v9 +; GCN-NEXT: v_add_f32_e32 v7, v7, v8 +; GCN-NEXT: v_exp_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3] +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_ldexp_f32 v4, v7, v8 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GCN-NEXT: v_fma_f32 v3, v0, s0, -v1 -; GCN-NEXT: v_rndne_f32_e32 v4, v1 -; GCN-NEXT: v_fmac_f32_e32 v3, 0x32a5705f, v0 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v4 +; GCN-NEXT: v_fma_f32 v7, v4, s0, -v5 +; GCN-NEXT: v_rndne_f32_e32 v8, v5 +; GCN-NEXT: v_fmac_f32_e32 v7, 0x32a5705f, v4 +; GCN-NEXT: v_sub_f32_e32 v5, v5, v8 +; GCN-NEXT: v_add_f32_e32 v5, v5, v7 +; GCN-NEXT: v_exp_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v7, v8 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_ldexp_f32 v1, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v0 +; GCN-NEXT: 
v_cmp_ngt_f32_e32 vcc, s6, v4 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_ldexp_f32 v5, v5, v7 +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: global_store_dword v4, v0, s[2:3] +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc +; GCN-NEXT: global_store_dword v8, v4, s[2:3] ; GCN-NEXT: s_endpgm %mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0) %mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee..b2931ad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefix=GCN %s define amdgpu_kernel void @test_iglp_opt() #0 { ; GCN-LABEL: test_iglp_opt: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 12a998a..f0040b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn 
-mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck --check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-VGPR %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) @@ -201,6 +202,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x2bf16: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29 
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15 +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x2bf16 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: s_nop 15 +; GFX90A-VGPR-NEXT: s_nop 2 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -311,6 +368,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x2bf16: +; GFX90A-VGPR: ; %bb.0: ; 
%bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x2bf16 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-VGPR-NEXT: s_nop 9 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -367,6 +450,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x2bf16: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x2bf16 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: s_nop 4 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -478,6 +578,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-VGPR-NEXT: s_nop 15 +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: global_store_dwordx4 
v16, v[12:15], s[16:17] offset:48 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -534,6 +661,23 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x8bf16: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x8bf16 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: s_nop 10 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -544,5 +688,3 @@ bb: } attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 5ab8706..3236864 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -17,115 +17,115 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2 +; GFX90A-NEXT: v_mov_b32_e32 v33, 0 +; GFX90A-NEXT: v_mov_b32_e32 v34, 1 +; GFX90A-NEXT: v_mov_b32_e32 v35, v33 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, s19 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-NEXT: v_mov_b32_e32 v13, s29 +; GFX90A-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-NEXT: v_mov_b32_e32 v16, s0 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-NEXT: v_mov_b32_e32 v19, s3 +; GFX90A-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-NEXT: v_mov_b32_e32 v27, s11 +; GFX90A-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-NEXT: v_mov_b32_e32 v30, s14 +; GFX90A-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] 
offset:96 -; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35] -; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b32_e32 v33, 0 +; GFX942-NEXT: v_mov_b32_e32 v34, 1 +; GFX942-NEXT: v_mov_b32_e32 v35, v33 +; GFX942-NEXT: v_mov_b32_e32 v32, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: 
v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 +; GFX942-NEXT: v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: 
v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 -; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 -; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64 -; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35] -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k: @@ -254,71 +254,55 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 
0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 1 +; GFX90A-NEXT: v_mov_b32_e32 v19, v17 +; GFX90A-NEXT: v_mov_b32_e32 v16, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 10 -; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: 
global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 1 +; GFX942-NEXT: v_mov_b32_e32 v19, v17 +; GFX942-NEXT: v_mov_b32_e32 v16, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; 
GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 10 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k: @@ -387,41 +371,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX90A-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 
blgp:3 +; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 1 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: @@ -472,72 +452,56 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 1 +; GFX90A-NEXT: v_mov_b32_e32 v19, v17 +; GFX90A-NEXT: v_mov_b32_e32 v16, 2 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: 
global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 1 +; GFX942-NEXT: v_mov_b32_e32 v19, v17 +; GFX942-NEXT: v_mov_b32_e32 v16, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], 
v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 10 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k: @@ -607,41 +571,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 10 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; 
GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 1 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: @@ -697,12 +657,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 +; GFX90A-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0 ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 
; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_4x4x4f64: @@ -713,12 +673,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], 0 +; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0 ; GFX942-NEXT: s_nop 3 -; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64: @@ -765,26 +725,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: v_mov_b32_e32 v10, s10 ; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s11 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v11, s11 +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64: @@ -792,26 +748,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: v_mov_b32_e32 v10, s10 ; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v11, s11 +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[12:13] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] 
-; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64: @@ -872,16 +824,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0: @@ -889,16 +841,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0: @@ -947,16 +899,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; 
GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1: @@ -964,16 +916,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1: @@ -1022,16 +974,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1: @@ -1039,16 +991,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; 
GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1: @@ -1097,16 +1049,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1: @@ -1114,16 +1066,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; 
GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1: @@ -1172,16 +1124,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64: @@ -1189,16 +1141,16 @@ define amdgpu_kernel void 
@test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64: @@ -1246,50 +1198,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 64 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 64 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], 
s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 64 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 64 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 
a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: @@ -1359,50 +1317,50 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 64 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 64 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: 
v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 64 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_mov_b32_e32 v0, 64 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; 
GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low: @@ -1466,50 +1424,50 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; 
GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low: @@ -1573,52 +1531,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x3ff00000 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v12, s2 +; GFX90A-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_imm: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x3ff00000 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NEXT: v_mov_b32_e32 v3, s3 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v12, s2 +; GFX942-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm: @@ -1687,52 +1649,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x405ec000 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v12, s2 +; GFX90A-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 
v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x405ec000 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NEXT: v_mov_b32_e32 v3, s3 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v12, s2 +; GFX942-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, 
v1 +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] ; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index dc4c9291..477c74c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -34,85 +34,77 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: 
v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: 
test_mfma_i32_16x16x32i8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: 
v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -165,145 +157,113 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-LABEL: test_mfma_i32_32x32x16i8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, 
s8 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_i32_32x32x16i8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: 
v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-NEXT: 
global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_i32_32x32x16i8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_i32_32x32x16i8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 
a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: 
global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX950-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg @@ -316,85 +276,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; 
GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -447,85 +399,77 @@ define 
amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 
blgp:3 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -578,85 +522,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-SDAG: ; %bb.0: ; %bb ; 
GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: 
v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: 
global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -709,85 +645,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v4, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: 
test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX950-GISEL-NEXT: s_nop 6 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb @@ -840,145 +768,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; 
GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 
v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 
a3, s3 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: 
test_mfma_f32_32x32x16_bf8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX950-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -991,145 +887,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; 
GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; 
GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; 
GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; 
GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, 
v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX950-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -1142,145 +1006,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; 
GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; 
GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; 
GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX950-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -1293,145 +1125,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, 
a[4:7], s[16:17] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8: ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; 
GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX950-SDAG-NEXT: s_endpgm ; 
; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb ; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-GISEL-NEXT: s_nop 1 -; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-GISEL-NEXT: s_nop 10 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX950-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 033a35f..951763b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefix=AGPR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=VGPR %s ; FIXME: bfloat vector arguments are broken in globalisel. 
; https://github.com/llvm/llvm-project/issues/77055 @@ -77,6 +78,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48 +; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32 +; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b32_e32 v16, s16 +; AGPR-NEXT: v_mov_b32_e32 v17, s17 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: v_mov_b32_e32 v18, s18 +; AGPR-NEXT: v_mov_b32_e32 v19, s19 +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0 +; AGPR-NEXT: s_nop 4 +; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: 
global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: v_mov_b32_e32 v48, s16 +; VGPR-NEXT: v_mov_b32_e32 v49, s17 +; VGPR-NEXT: 
v_mfma_f32_32x32x16_bf16 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPR-NEXT: v_mov_b32_e32 v50, s18 +; VGPR-NEXT: v_mov_b32_e32 v51, s19 +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store volatile <16 x float> %result, ptr addrspace(1) null store volatile <16 x float> %arg2, ptr addrspace(1) null @@ -150,6 +278,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; 
AGPR-LABEL: test_mfma_f32_32x32x16_bf16__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48 +; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32 +; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b32_e32 v16, s16 +; AGPR-NEXT: v_mov_b32_e32 v17, s17 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v18, s18 +; AGPR-NEXT: v_mov_b32_e32 v19, s19 +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0 +; AGPR-NEXT: s_nop 4 +; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; 
AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: v_mov_b32_e32 v48, s16 +; VGPR-NEXT: v_mov_b32_e32 v49, s17 +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPR-NEXT: v_mov_b32_e32 v50, s18 +; VGPR-NEXT: v_mov_b32_e32 v51, s19 +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 0 +; 
VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) store volatile <16 x float> %result, ptr addrspace(1) null store volatile <16 x float> %arg2, ptr addrspace(1) null @@ -196,6 +451,71 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: 
v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[8:23], v[0:3], v[4:7], v[8:23] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, 
v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -240,6 +560,71 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: 
v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) ret <16 x float> %result } @@ -301,6 +686,120 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v44, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; AGPR-NEXT: 
v_mov_b64_e32 v[38:39], s[30:31] +; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; AGPR-NEXT: v_mov_b32_e32 v40, s20 +; AGPR-NEXT: v_mov_b32_e32 v41, s21 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] +; AGPR-NEXT: v_mov_b32_e32 v42, s22 +; AGPR-NEXT: v_mov_b32_e32 v43, s23 +; AGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: v_mov_b32_e32 v16, s16 +; AGPR-NEXT: v_mov_b32_e32 v17, s17 +; AGPR-NEXT: v_mov_b32_e32 v18, s18 +; AGPR-NEXT: v_mov_b32_e32 v19, s19 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v16, s12 +; AGPR-NEXT: v_mov_b32_e32 v17, s13 +; AGPR-NEXT: v_mov_b32_e32 v18, s14 +; AGPR-NEXT: v_mov_b32_e32 v19, s15 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v16, s8 +; AGPR-NEXT: v_mov_b32_e32 v17, s9 +; AGPR-NEXT: v_mov_b32_e32 v18, s10 +; AGPR-NEXT: v_mov_b32_e32 v19, s11 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; 
AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v44, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v40, s20 +; VGPR-NEXT: v_mov_b32_e32 v41, s21 +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPR-NEXT: v_mov_b32_e32 v42, s22 +; VGPR-NEXT: v_mov_b32_e32 v43, s23 +; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: 
global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store volatile <16 x float> %arg2, ptr addrspace(1) %out store volatile <16 x float> %result, ptr addrspace(1) %out @@ -364,6 +863,120 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v44, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; AGPR-NEXT: v_mov_b32_e32 v40, s20 +; AGPR-NEXT: v_mov_b32_e32 v41, s21 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], 
v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; AGPR-NEXT: v_mov_b32_e32 v42, s22 +; AGPR-NEXT: v_mov_b32_e32 v43, s23 +; AGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: v_mov_b32_e32 v16, s16 +; AGPR-NEXT: v_mov_b32_e32 v17, s17 +; AGPR-NEXT: v_mov_b32_e32 v18, s18 +; AGPR-NEXT: v_mov_b32_e32 v19, s19 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v16, s12 +; AGPR-NEXT: v_mov_b32_e32 v17, s13 +; AGPR-NEXT: v_mov_b32_e32 v18, s14 +; AGPR-NEXT: v_mov_b32_e32 v19, s15 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v16, s8 +; AGPR-NEXT: v_mov_b32_e32 v17, s9 +; AGPR-NEXT: v_mov_b32_e32 v18, s10 +; AGPR-NEXT: v_mov_b32_e32 v19, s11 +; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v44, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: 
v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v40, s20 +; VGPR-NEXT: v_mov_b32_e32 v41, s21 +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPR-NEXT: v_mov_b32_e32 v42, s22 +; VGPR-NEXT: v_mov_b32_e32 v43, s23 +; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x 
float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) store volatile <16 x float> %arg2, ptr addrspace(1) %out store volatile <16 x float> %result, ptr addrspace(1) %out @@ -398,6 +1011,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] +; AGPR-NEXT: v_mov_b32_e32 v16, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: 
s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out ret void @@ -431,6 +1102,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm +; +; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; AGPR-NEXT: 
v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v16, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; +; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] 
offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 7532062..65beb18 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -15,17 +15,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg ; GCN-LABEL: test_mfma_f32_16x16x32_f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16: @@ -77,17 +67,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 
v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags: @@ -382,66 +362,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 48 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 32 +; SDAG-NEXT: v_mov_b64_e32 v[44:45], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 -; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 
v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b32_e32 v48, s16 +; SDAG-NEXT: v_mov_b32_e32 v49, s17 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v50, s18 +; SDAG-NEXT: v_mov_b32_e32 v51, s19 +; SDAG-NEXT: v_mov_b64_e32 v[46:47], 0 +; SDAG-NEXT: s_nop 8 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; 
SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -449,58 +421,50 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; 
GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 
v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -765,66 +729,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 48 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 32 +; SDAG-NEXT: v_mov_b64_e32 v[44:45], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: 
v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 -; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b32_e32 v48, s16 +; SDAG-NEXT: v_mov_b32_e32 v49, s17 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v50, s18 +; SDAG-NEXT: v_mov_b32_e32 v51, s19 +; SDAG-NEXT: v_mov_b64_e32 v[46:47], 0 +; SDAG-NEXT: s_nop 8 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 
v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -832,58 +788,50 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], 
s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; 
GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1147,41 +1095,24 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: 
v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac: @@ -1317,41 +1248,24 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-LABEL: 
test_mfma_f32_32x32x16_f16__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: 
v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: @@ -2539,17 +2453,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-LABEL: test_mfma_i32_16x16x64_i8: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8: @@ -2601,17 +2505,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags: @@ -2964,70 +2858,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 
x i32> %arg0, <4 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[32:33], 48 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], 32 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b32_e32 v8, s28 -; SDAG-NEXT: v_mov_b32_e32 v9, s29 -; SDAG-NEXT: v_mov_b32_e32 v10, s30 -; SDAG-NEXT: v_mov_b32_e32 v11, s31 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b32_e32 v40, s28 +; SDAG-NEXT: v_mov_b32_e32 v41, s29 +; SDAG-NEXT: v_mov_b32_e32 v42, s30 +; SDAG-NEXT: v_mov_b32_e32 v43, s31 +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: 
s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 5 -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 0 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 
v[32:33], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -3035,58 +2926,50 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: 
v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: 
global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -3376,70 +3259,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[32:33], 48 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], 32 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b32_e32 v8, s28 -; SDAG-NEXT: v_mov_b32_e32 v9, s29 -; SDAG-NEXT: v_mov_b32_e32 v10, s30 -; SDAG-NEXT: v_mov_b32_e32 v11, s31 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: 
v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b32_e32 v40, s28 +; SDAG-NEXT: v_mov_b32_e32 v41, s29 +; SDAG-NEXT: v_mov_b32_e32 v42, s30 +; SDAG-NEXT: v_mov_b32_e32 v43, s31 +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 5 -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: s_nop 11 +; SDAG-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 ; 
SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 0 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -3447,58 +3327,50 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], 
s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32 +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 8 -; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 
s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; 
GISEL-NEXT: s_endpgm ; @@ -3787,41 +3659,24 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; 
GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac: @@ -3957,41 +3812,24 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: 
v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: @@ -5299,17 +5137,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-LABEL: test_mfma_f32_16x16x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16: @@ -5361,17 +5189,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 
a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll index d24f1f0..61593a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A-VGPR %s declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) @@ -109,6 +110,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_i32_32x32x8i8: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: 
s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_i32_32x32x8i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-VGPR-NEXT: s_nop 15 +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) @@ -163,6 +191,23 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm +; +; GFX90A-VGPR-LABEL: test_mfma_i32_16x16x16i8: +; GFX90A-VGPR: ; %bb.0: ; %bb +; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: s_nop 1 +; GFX90A-VGPR-NEXT: v_mfma_i32_16x16x16i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: s_nop 10 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7e30af9..c31ea52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -294,113 +294,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_f32_32x32x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, s19 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-NEXT: v_mov_b32_e32 v13, s29 +; GFX90A-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-NEXT: v_mov_b32_e32 v16, s0 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-NEXT: v_mov_b32_e32 v19, s3 +; GFX90A-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-NEXT: v_mov_b32_e32 v27, s11 +; GFX90A-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-NEXT: 
v_mov_b32_e32 v30, s14 +; GFX90A-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, 
s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; 
GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 +; GFX942-NEXT: v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; 
GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32: @@ -603,69 +603,53 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_f32_16x16x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15] cbsz:1 
abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], 
s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32: @@ -760,39 +744,35 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_f32_4x4x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: 
v_mfma_f32_4x4x1f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 3 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32: @@ -956,71 +936,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_f32_32x32x2f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x2f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: 
test_mfma_f32_32x32x2f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: 
global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32: @@ -1116,39 +1080,35 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_f32_16x16x4f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_16x16x4f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 10 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x4f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 9 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32: @@ -1456,121 +1416,121 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-LABEL: test_mfma_f32_32x32x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_mov_b32_e32 v16, s0 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-NEXT: v_mov_b32_e32 v19, s3 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, s19 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX90A-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX90A-NEXT: v_mov_b32_e32 v4, s2 -; GFX90A-NEXT: v_mov_b32_e32 v5, s3 +; GFX90A-NEXT: v_mov_b32_e32 v34, s0 +; GFX90A-NEXT: v_mov_b32_e32 v35, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-NEXT: v_mov_b32_e32 v13, s29 +; GFX90A-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-NEXT: 
v_mov_b32_e32 v27, s11 +; GFX90A-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-NEXT: v_mov_b32_e32 v30, s14 +; GFX90A-NEXT: v_mov_b32_e32 v31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v36, s2 +; GFX90A-NEXT: v_mov_b32_e32 v37, s3 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x4f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37] +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; GFX942-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-NEXT: v_mov_b32_e32 v3, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX942-NEXT: v_mov_b32_e32 v4, s2 -; GFX942-NEXT: v_mov_b32_e32 v5, s3 +; GFX942-NEXT: 
v_mov_b32_e32 v34, s0 +; GFX942-NEXT: v_mov_b32_e32 v35, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; GFX942-NEXT: v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 +; GFX942-NEXT: v_mov_b32_e32 v36, s2 +; GFX942-NEXT: v_mov_b32_e32 v37, s3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], 
s[36:37] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37] +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16: @@ -1790,34 +1750,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s22 -; GFX90A-NEXT: v_mov_b32_e32 v3, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_mov_b32_e32 v16, s20 +; GFX90A-NEXT: v_mov_b32_e32 v17, s21 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v18, s22 +; GFX90A-NEXT: v_mov_b32_e32 v19, s23 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], 
s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x4f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x4f16: @@ -1827,34 +1779,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s20 -; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_mov_b32_e32 v2, s22 -; GFX942-NEXT: v_mov_b32_e32 v3, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 
s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b32_e32 v16, s20 +; GFX942-NEXT: v_mov_b32_e32 v17, s21 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v18, s22 +; GFX942-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 9 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16: @@ -1961,45 +1905,41 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX90A-LABEL: test_mfma_f32_4x4x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; 
GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX90A-NEXT: v_mov_b32_e32 v4, s6 -; GFX90A-NEXT: v_mov_b32_e32 v5, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: v_mov_b32_e32 v7, s5 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v9, s7 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_4x4x4f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX942-NEXT: v_mov_b32_e32 v4, s6 -; GFX942-NEXT: v_mov_b32_e32 v5, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX942-NEXT: v_mov_b32_e32 v6, s4 +; GFX942-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-NEXT: v_mov_b64_e32 
v[0:1], s[8:9] +; GFX942-NEXT: v_mov_b32_e32 v8, s6 +; GFX942-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16: @@ -2179,35 +2119,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s22 -; GFX90A-NEXT: v_mov_b32_e32 v3, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_mov_b32_e32 v16, s20 +; GFX90A-NEXT: v_mov_b32_e32 v17, s21 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v18, s22 +; GFX90A-NEXT: v_mov_b32_e32 v19, s23 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] 
op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x8f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x8f16: @@ -2217,34 +2149,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s20 -; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_mov_b32_e32 v2, s22 -; GFX942-NEXT: v_mov_b32_e32 v3, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; 
GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b32_e32 v16, s20 +; GFX942-NEXT: v_mov_b32_e32 v17, s21 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v18, s22 +; GFX942-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 9 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16: @@ -2351,45 +2275,41 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-LABEL: test_mfma_f32_16x16x16f16: ; GFX90A: ; 
%bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX90A-NEXT: v_mov_b32_e32 v4, s6 -; GFX90A-NEXT: v_mov_b32_e32 v5, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: v_mov_b32_e32 v7, s5 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v9, s7 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_16x16x16f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 10 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x16f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX942-NEXT: v_mov_b32_e32 v4, s6 -; GFX942-NEXT: v_mov_b32_e32 v5, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX942-NEXT: v_mov_b32_e32 v6, 
s4 +; GFX942-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX942-NEXT: v_mov_b32_e32 v8, s6 +; GFX942-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: @@ -2667,113 +2587,113 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_i32_32x32x4i8: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1 +; GFX90A-NEXT: v_mov_b32_e32 v34, 2 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 
s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, s19 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-NEXT: v_mov_b32_e32 v13, s29 +; GFX90A-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-NEXT: v_mov_b32_e32 v16, s0 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-NEXT: v_mov_b32_e32 v19, s3 +; GFX90A-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-NEXT: v_mov_b32_e32 v27, s11 +; GFX90A-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-NEXT: v_mov_b32_e32 v30, s14 +; GFX90A-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 
blgp:3 +; GFX90A-NEXT: v_mfma_i32_32x32x4i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_32x32x4i8: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1 +; GFX942-NEXT: v_mov_b32_e32 v34, 2 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942-NEXT: v_accvgpr_write_b32 
a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 +; GFX942-NEXT: 
v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8: @@ -2976,69 +2896,53 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: 
test_mfma_i32_16x16x4i8: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2 +; GFX90A-NEXT: v_mov_b32_e32 v16, 1 +; GFX90A-NEXT: v_mov_b32_e32 v17, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_i32_16x16x4i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: 
global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_16x16x4i8: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1 -; GFX942-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-NEXT: v_mov_b32_e32 v16, 1 +; GFX942-NEXT: v_mov_b32_e32 v17, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: 
v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 9 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8: @@ -3157,15 +3061,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX90A-NEXT: v_mov_b32_e32 v0, 1 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 8 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: 
test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3173,15 +3076,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX942-NEXT: v_mov_b32_e32 v0, 1 ; GFX942-NEXT: v_mov_b32_e32 v1, 2 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 9 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64: @@ -3265,39 +3167,35 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_i32_4x4x4i8: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_4x4x4i8: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1 -; GFX942-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8: @@ -3368,26 +3266,26 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace( ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 1 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3 +; 
GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 1 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 3 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1: @@ -3458,38 +3356,36 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) ; ; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: v_mov_b32_e32 v5, 1 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x41 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 
blgp:3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX942: ; %bb.0: -; GFX942-NEXT: v_mov_b32_e32 v1, 0x41 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: v_mov_b32_e32 v5, 1 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x41 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 3 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: @@ -3790,115 +3686,115 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a1, s17 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v3, s19 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 +; GFX90A-NEXT: v_mov_b32_e32 v5, s21 +; GFX90A-NEXT: v_mov_b32_e32 v6, s22 +; GFX90A-NEXT: v_mov_b32_e32 v7, s23 +; GFX90A-NEXT: v_mov_b32_e32 v8, s24 +; GFX90A-NEXT: v_mov_b32_e32 v9, s25 +; GFX90A-NEXT: v_mov_b32_e32 v10, s26 +; GFX90A-NEXT: v_mov_b32_e32 v11, s27 +; GFX90A-NEXT: v_mov_b32_e32 v12, s28 +; GFX90A-NEXT: v_mov_b32_e32 v13, s29 +; GFX90A-NEXT: v_mov_b32_e32 v14, s30 +; GFX90A-NEXT: v_mov_b32_e32 v15, s31 +; GFX90A-NEXT: 
v_mov_b32_e32 v16, s0 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s2 +; GFX90A-NEXT: v_mov_b32_e32 v19, s3 +; GFX90A-NEXT: v_mov_b32_e32 v20, s4 +; GFX90A-NEXT: v_mov_b32_e32 v21, s5 +; GFX90A-NEXT: v_mov_b32_e32 v22, s6 +; GFX90A-NEXT: v_mov_b32_e32 v23, s7 +; GFX90A-NEXT: v_mov_b32_e32 v24, s8 +; GFX90A-NEXT: v_mov_b32_e32 v25, s9 +; GFX90A-NEXT: v_mov_b32_e32 v26, s10 +; GFX90A-NEXT: v_mov_b32_e32 v27, s11 +; GFX90A-NEXT: v_mov_b32_e32 v28, s12 +; GFX90A-NEXT: v_mov_b32_e32 v29, s13 +; GFX90A-NEXT: v_mov_b32_e32 v30, s14 +; GFX90A-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX90A-NEXT: 
global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: 
v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 +; GFX942-NEXT: v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, 
a[8:11], s[34:35] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc: @@ -4103,71 +3999,55 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v16, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: 
v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-NEXT: 
s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc: @@ -4266,42 +4146,38 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v4, v5, v[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v4, v5, v[0:3] ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: 
v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-NEXT: s_nop 3 -; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc: @@ -4375,26 +4251,26 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, 1.0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, 1.0 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; 
GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat: @@ -4509,15 +4385,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 8 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm_splat: @@ -4525,15 +4400,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 8 +; GFX942-NEXT: 
global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat: @@ -4657,16 +4531,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40004000 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x8f16_imm_splat: @@ -4676,15 +4549,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mov_b32_e32 v2, 0x40004000 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 9 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat: @@ -4870,20 +4742,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], 
s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -4891,19 +4762,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; 
GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat: @@ -4978,36 +4849,32 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[0:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 2.0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; 
GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm: @@ -5141,64 +5008,60 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 2.0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v0, v15, v[0:15] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: 
global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 2.0 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v15, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: 
v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm: @@ -5436,106 +5299,136 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a1 -; 
GFX90A-NEXT: v_accvgpr_mov_b32 a23, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a1 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v1 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 +; GFX90A-NEXT: v_mov_b32_e32 v14, v1 +; GFX90A-NEXT: v_mov_b32_e32 v15, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, v1 +; GFX90A-NEXT: v_mov_b32_e32 v17, v1 +; GFX90A-NEXT: v_mov_b32_e32 v18, v1 +; GFX90A-NEXT: v_mov_b32_e32 v19, v1 +; GFX90A-NEXT: v_mov_b32_e32 v20, v1 +; GFX90A-NEXT: v_mov_b32_e32 v21, v1 +; GFX90A-NEXT: v_mov_b32_e32 v22, v1 +; GFX90A-NEXT: v_mov_b32_e32 v23, v1 +; GFX90A-NEXT: v_mov_b32_e32 v24, v1 +; GFX90A-NEXT: v_mov_b32_e32 v25, v1 +; GFX90A-NEXT: v_mov_b32_e32 v26, v1 +; GFX90A-NEXT: v_mov_b32_e32 v27, v1 +; GFX90A-NEXT: v_mov_b32_e32 v28, v1 +; GFX90A-NEXT: v_mov_b32_e32 v29, v1 +; GFX90A-NEXT: v_mov_b32_e32 v30, v1 +; GFX90A-NEXT: v_mov_b32_e32 v31, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[32:33], v[30:31], v[30:31] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX90A-NEXT: v_pk_mov_b32 v[30:31], v[28:29], v[28:29] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[28:29], v[26:27], v[26:27] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[26:27], v[24:25], v[24:25] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[24:25], v[22:23], v[22:23] 
op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[20:21], v[20:21] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[20:21], v[18:19], v[18:19] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[18:19], v[16:17], v[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[16:17], v[14:15], v[14:15] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[12:13], v[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v0, v34, v[2:33] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v1, 
v[10:13], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a1 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; 
GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v10, v1 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, v1 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 +; GFX942-NEXT: v_mov_b32_e32 v14, v1 +; GFX942-NEXT: v_mov_b32_e32 v15, v1 +; GFX942-NEXT: v_mov_b32_e32 v16, v1 +; GFX942-NEXT: v_mov_b32_e32 v17, v1 +; GFX942-NEXT: v_mov_b32_e32 v18, v1 +; GFX942-NEXT: v_mov_b32_e32 v19, v1 +; GFX942-NEXT: v_mov_b32_e32 v20, v1 +; GFX942-NEXT: v_mov_b32_e32 v21, v1 +; GFX942-NEXT: v_mov_b32_e32 v22, v1 +; GFX942-NEXT: v_mov_b32_e32 v23, v1 +; GFX942-NEXT: v_mov_b32_e32 v24, v1 +; GFX942-NEXT: v_mov_b32_e32 v25, v1 +; GFX942-NEXT: v_mov_b32_e32 v26, v1 +; GFX942-NEXT: v_mov_b32_e32 v27, v1 +; GFX942-NEXT: v_mov_b32_e32 v28, v1 +; GFX942-NEXT: v_mov_b32_e32 v29, v1 +; GFX942-NEXT: v_mov_b32_e32 v30, v1 +; GFX942-NEXT: v_mov_b32_e32 v31, v1 +; GFX942-NEXT: v_mov_b64_e32 v[32:33], v[30:31] +; GFX942-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942-NEXT: v_mov_b64_e32 v[30:31], v[28:29] +; GFX942-NEXT: v_mov_b64_e32 v[28:29], v[26:27] +; GFX942-NEXT: v_mov_b64_e32 v[26:27], v[24:25] +; GFX942-NEXT: v_mov_b64_e32 v[24:25], v[22:23] +; GFX942-NEXT: v_mov_b64_e32 v[22:23], v[20:21] +; GFX942-NEXT: v_mov_b64_e32 v[20:21], v[18:19] +; GFX942-NEXT: v_mov_b64_e32 v[18:19], v[16:17] +; GFX942-NEXT: v_mov_b64_e32 v[16:17], v[14:15] +; GFX942-NEXT: v_mov_b64_e32 v[14:15], v[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[10:11] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[8:9] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: 
s_nop 0 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm: @@ -5659,38 +5552,38 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v5, v6, v[0:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat: @@ -5768,38 +5661,36 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, 
a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v5, v6, v[0:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] +; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: @@ -6022,60 +5913,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX90A: ; %bb.0: ; %bb ; 
GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX90A-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX90A-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX90A-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX90A-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX90A-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX90A-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], 
s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_vecarg: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 +; GFX942-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v34, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 -; GFX942-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 -; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80 -; GFX942-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64 -; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48 -; GFX942-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 -; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[16:19], v32, 
s[0:1] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index aae14c8..d9359c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -17,17 +17,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -40,17 +30,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -63,17 +43,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -86,17 +56,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_setpc_b64 
s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -109,17 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -132,17 +82,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 
v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -155,17 +95,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -178,17 +108,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -202,17 +122,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -226,17 +136,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -250,17 +150,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -274,17 +164,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -298,17 +178,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -322,17 +192,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -346,17 +206,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -370,17 +220,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -394,17 +234,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -418,17 +248,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -442,17 +262,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -466,17 +276,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -491,17 +291,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -515,17 +305,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -538,17 +318,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -562,17 +332,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -586,17 +346,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -610,17 +360,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -634,17 +374,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] cbsz:1 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 1, ; cbsz @@ -658,17 +388,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 
v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -682,17 +402,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -706,17 +416,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; 
GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -730,17 +430,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -754,17 +444,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 
v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -778,17 +458,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -802,17 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -826,17 +486,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -851,17 +501,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -875,17 +515,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -899,17 +529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -923,17 +543,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -947,17 +557,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -971,17 +571,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -995,17 +585,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 
v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -1019,17 +599,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:3 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -1043,17 +613,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -1067,17 +627,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 3, ; cbsz @@ -1091,17 +641,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -1115,17 +655,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:2 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -1139,17 +669,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1163,17 +683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1187,17 +697,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1211,17 +711,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4 blgp:1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1235,17 +725,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: 
v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1259,17 +739,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1283,17 +753,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; 
GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1307,17 +767,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1331,17 +781,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 
7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1355,17 +795,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:4 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -1382,19 +812,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s0 +; GCN-NEXT: v_mov_b32_e32 v21, s1 ; 
GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1404,18 +825,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v21, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1425,18 +837,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: ; GCN: 
; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v21, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1446,35 +849,28 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v14, s0 -; SDAG-NEXT: v_mov_b32_e32 v15, s1 -; SDAG-NEXT: v_mov_b32_e32 v16, s2 -; SDAG-NEXT: v_mov_b32_e32 v17, s3 -; SDAG-NEXT: v_mov_b32_e32 v18, s16 -; SDAG-NEXT: v_mov_b32_e32 v19, s17 -; SDAG-NEXT: v_mov_b32_e32 v20, s18 -; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; 
SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_mov_b32_e32 v10, s24 -; SDAG-NEXT: v_mov_b32_e32 v11, s25 -; SDAG-NEXT: v_mov_b32_e32 v12, s26 -; SDAG-NEXT: v_mov_b32_e32 v13, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[16:23], v[8:15], v[4:7], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: @@ -1488,23 +884,16 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b32_e32 v22, v0 +; GISEL-NEXT: v_mov_b32_e32 v23, v1 ; GISEL-NEXT: v_mov_b32_e32 v20, s28 ; GISEL-NEXT: v_mov_b32_e32 v21, s29 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 ; GISEL-NEXT: 
s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1522,18 +911,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: @@ -1547,18 +927,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: 
v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v13, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1576,18 +947,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: @@ -1601,18 +963,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 
v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v13, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1630,18 +983,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: @@ -1655,40 
+999,36 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v13, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_mov_b32_e32 v17, s16 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v18, s0 +; SDAG-NEXT: v_mov_b32_e32 v19, s1 +; SDAG-NEXT: v_mov_b32_e32 v20, s2 +; SDAG-NEXT: v_mov_b32_e32 v21, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v17, s16 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result } @@ -1697,26 +1037,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v14, s16 -; SDAG-NEXT: v_mov_b32_e32 v15, s17 -; SDAG-NEXT: v_mov_b32_e32 v16, s18 -; SDAG-NEXT: v_mov_b32_e32 v17, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, 
s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: v_mov_b32_e32 v9, s24 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[10:13], v8, v9 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: @@ -1730,18 +1065,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v9, s24 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[10:17], v[0:7], v[18:21], v8, v9 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <4 x float> %result @@ -1753,35 +1081,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v20, -2 ; SDAG-NEXT: v_mov_b32_e32 v21, 33 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 33 -; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: v_mov_b32_e32 v20, 33 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result 
= call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result @@ -1793,35 +1103,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v20, -2 ; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_setpc_b64 
s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) ret <4 x float> %result @@ -1833,35 +1125,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d ; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v21, 0x4d ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0] ; GISEL-NEXT: 
s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) ret <4 x float> %result @@ -2188,17 +1462,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %result @@ -2209,17 +1473,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] ; GCN-NEXT: s_setpc_b64 
s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <4 x float> %result @@ -2231,35 +1485,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v20, 1 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: v_mov_b32_e32 v20, 0 +; GISEL-NEXT: v_mov_b32_e32 v21, 1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result 
= call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result @@ -2271,35 +1507,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 1 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_mov_b32_e32 v16, 1 -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v20, 1 +; GISEL-NEXT: v_mov_b32_e32 v21, 0 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x 
float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result @@ -2313,17 +1531,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -2336,17 +1544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], 
v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -2359,17 +1557,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -2382,17 +1570,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: 
v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:2 blgp:2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 2, ; cbsz @@ -2405,17 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -2428,17 +1596,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 
v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -2451,17 +1609,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 0, ; cbsz @@ -2474,17 +1622,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -2497,17 +1635,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz @@ -2520,17 +1648,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:4 blgp:4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 4, ; cbsz diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3..978284e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -17,89 +17,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 
+; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -112,89 +80,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; 
SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 
a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 
s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -207,89 +143,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; 
SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 
offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -302,89 +206,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; 
SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; 
GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -397,89 +269,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword 
v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; 
SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; 
GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -492,89 +332,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: 
v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, 
v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: 
v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -587,89 +395,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: 
v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; 
GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -682,89 +458,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; 
SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: 
scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; 
GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -778,43 +522,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, 
a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -828,89 +556,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: 
v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: 
v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, 
a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -923,43 +619,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -974,43 +654,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, 
v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; 
GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1023,42 +686,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, 
a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1073,43 +719,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1122,42 +751,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: 
v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1171,42 +783,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; 
GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 
v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1219,42 +814,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] blgp:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: 
v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -1268,89 +846,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: 
v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: 
scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, 
v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1363,43 +909,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: 
v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1413,89 +943,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: 
v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, 
v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: 
v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1509,43 +1007,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; 
GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1560,43 +1042,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, 
v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: 
v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1609,42 +1074,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1659,43 +1107,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 +; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1708,42 +1139,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 
a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1757,42 +1171,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 
v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1805,42 +1202,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 
v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 1, ; cbsz @@ -1855,43 +1235,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; 
GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -1904,42 +1267,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: 
v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -1954,43 +1300,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; 
GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2003,42 +1332,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: 
v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2052,41 +1364,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; 
GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: 
s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2099,41 +1394,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: 
v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2147,41 +1425,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2194,41 +1455,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 
a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2244,43 +1488,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: 
v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2293,42 +1520,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, 
a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2343,43 +1553,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] 
cbsz:3 blgp:1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2392,42 +1585,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; 
GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2441,41 +1617,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: 
v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2488,41 +1647,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, 
a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2536,41 +1678,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: 
v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2583,41 +1708,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: 
v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2631,41 +1739,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, 
a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2678,41 +1769,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], 
v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 3, ; cbsz @@ -2726,41 +1800,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: 
v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2773,41 +1830,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 
v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -2821,42 +1861,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 
v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -2869,42 +1892,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; 
GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -2918,42 +1924,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: 
v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -2966,42 +1955,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: 
v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3015,41 +1987,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 +; GCN-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3062,41 +2017,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: 
v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, 
<6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3110,41 +2048,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: 
v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3157,41 +2078,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v10 -; GCN-NEXT: v_accvgpr_write_b32 a1, v11 -; GCN-NEXT: v_accvgpr_write_b32 a2, v12 -; GCN-NEXT: v_accvgpr_write_b32 a3, v13 -; GCN-NEXT: v_accvgpr_write_b32 a4, v14 -; GCN-NEXT: v_accvgpr_write_b32 a5, v15 -; GCN-NEXT: v_accvgpr_write_b32 a6, v16 -; GCN-NEXT: v_accvgpr_write_b32 a7, v17 -; GCN-NEXT: v_accvgpr_write_b32 a8, v18 -; GCN-NEXT: v_accvgpr_write_b32 a9, v19 -; GCN-NEXT: v_accvgpr_write_b32 a10, v20 -; GCN-NEXT: v_accvgpr_write_b32 a11, v21 -; GCN-NEXT: v_accvgpr_write_b32 a12, v22 -; GCN-NEXT: v_accvgpr_write_b32 a13, v23 -; GCN-NEXT: v_accvgpr_write_b32 a14, v24 -; GCN-NEXT: v_accvgpr_write_b32 a15, v25 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: 
v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v12 +; GCN-NEXT: v_mov_b32_e32 v3, v13 +; GCN-NEXT: v_mov_b32_e32 v4, v14 +; GCN-NEXT: v_mov_b32_e32 v5, v15 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: v_mov_b32_e32 v7, v17 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v9, v19 +; GCN-NEXT: v_mov_b32_e32 v10, v20 +; GCN-NEXT: v_mov_b32_e32 v11, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v22 +; GCN-NEXT: v_mov_b32_e32 v13, v23 +; GCN-NEXT: v_mov_b32_e32 v14, v24 +; GCN-NEXT: v_mov_b32_e32 v15, v25 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3205,41 +2109,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; 
GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3252,41 +2139,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v8 -; GCN-NEXT: v_accvgpr_write_b32 a1, v9 -; GCN-NEXT: v_accvgpr_write_b32 a2, v10 -; GCN-NEXT: v_accvgpr_write_b32 a3, v11 -; GCN-NEXT: v_accvgpr_write_b32 a4, v12 -; GCN-NEXT: v_accvgpr_write_b32 a5, v13 -; GCN-NEXT: v_accvgpr_write_b32 a6, v14 -; GCN-NEXT: v_accvgpr_write_b32 a7, v15 -; GCN-NEXT: v_accvgpr_write_b32 a8, v16 -; GCN-NEXT: v_accvgpr_write_b32 a9, v17 -; GCN-NEXT: v_accvgpr_write_b32 a10, v18 -; GCN-NEXT: v_accvgpr_write_b32 a11, v19 -; GCN-NEXT: v_accvgpr_write_b32 a12, v20 -; GCN-NEXT: v_accvgpr_write_b32 a13, v21 -; GCN-NEXT: v_accvgpr_write_b32 a14, v22 -; GCN-NEXT: v_accvgpr_write_b32 a15, v23 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: v_mov_b32_e32 v5, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v14 +; GCN-NEXT: v_mov_b32_e32 v7, v15 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v9, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v18 +; GCN-NEXT: v_mov_b32_e32 v11, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v20 +; GCN-NEXT: 
v_mov_b32_e32 v13, v21 +; GCN-NEXT: v_mov_b32_e32 v14, v22 +; GCN-NEXT: v_mov_b32_e32 v15, v23 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -3303,45 +2173,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: v_mov_b32_e32 v32, s0 +; GCN-NEXT: v_mov_b32_e32 v33, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 
v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3351,45 +2206,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; 
GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v33, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, 
i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3399,45 +2239,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: scratch_load_dword v31, off, s32 +; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v33, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: 
v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3455,8 +2280,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v16, v15 +; SDAG-NEXT: v_mov_b32_e32 v17, v14 +; SDAG-NEXT: v_mov_b32_e32 v15, v13 +; SDAG-NEXT: v_mov_b32_e32 v14, v12 +; SDAG-NEXT: v_mov_b32_e32 v13, v11 +; SDAG-NEXT: v_mov_b32_e32 v12, v10 +; SDAG-NEXT: v_mov_b32_e32 v11, v9 +; SDAG-NEXT: v_mov_b32_e32 v10, v8 +; SDAG-NEXT: v_mov_b32_e32 v9, v7 +; SDAG-NEXT: v_mov_b32_e32 v8, v6 +; SDAG-NEXT: v_mov_b32_e32 v7, v5 +; SDAG-NEXT: v_mov_b32_e32 v6, v4 +; SDAG-NEXT: v_mov_b32_e32 v5, v3 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 +; SDAG-NEXT: v_mov_b32_e32 v3, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v1, s29 ; SDAG-NEXT: v_mov_b32_e32 v18, s20 ; SDAG-NEXT: v_mov_b32_e32 v19, s21 ; SDAG-NEXT: v_mov_b32_e32 
v20, s22 @@ -3465,42 +2306,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v23, s25 ; SDAG-NEXT: v_mov_b32_e32 v24, s26 ; SDAG-NEXT: v_mov_b32_e32 v25, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: @@ -3510,52 +2317,50 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: 
v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[32:39], v[16:31], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x 
float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3573,43 +2378,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; 
SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: v_mov_b32_e32 v2, v10 +; SDAG-NEXT: v_mov_b32_e32 v3, v11 +; SDAG-NEXT: v_mov_b32_e32 v4, v12 +; SDAG-NEXT: v_mov_b32_e32 v5, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v14 +; SDAG-NEXT: v_mov_b32_e32 v7, v15 +; SDAG-NEXT: v_mov_b32_e32 v8, v16 +; SDAG-NEXT: v_mov_b32_e32 v9, v17 +; SDAG-NEXT: v_mov_b32_e32 v10, v18 +; SDAG-NEXT: v_mov_b32_e32 v11, v19 +; SDAG-NEXT: v_mov_b32_e32 v12, v20 +; SDAG-NEXT: v_mov_b32_e32 v13, v21 +; SDAG-NEXT: v_mov_b32_e32 v14, v22 +; SDAG-NEXT: v_mov_b32_e32 v15, v23 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr: @@ -3623,43 +2412,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v25, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; 
GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: v_mov_b32_e32 v4, v12 +; GISEL-NEXT: v_mov_b32_e32 v5, v13 +; GISEL-NEXT: v_mov_b32_e32 v6, v14 +; GISEL-NEXT: v_mov_b32_e32 v7, v15 +; GISEL-NEXT: v_mov_b32_e32 v8, v16 +; GISEL-NEXT: v_mov_b32_e32 v9, v17 +; GISEL-NEXT: v_mov_b32_e32 v10, v18 +; GISEL-NEXT: v_mov_b32_e32 v11, v19 +; GISEL-NEXT: v_mov_b32_e32 v12, v20 +; GISEL-NEXT: v_mov_b32_e32 v13, v21 +; GISEL-NEXT: v_mov_b32_e32 v14, v22 +; GISEL-NEXT: v_mov_b32_e32 v15, v23 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3677,43 +2450,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: 
v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: v_mov_b32_e32 v2, v10 +; SDAG-NEXT: v_mov_b32_e32 v3, v11 +; SDAG-NEXT: v_mov_b32_e32 v4, v12 +; SDAG-NEXT: v_mov_b32_e32 v5, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v14 +; SDAG-NEXT: v_mov_b32_e32 v7, v15 +; SDAG-NEXT: v_mov_b32_e32 v8, v16 +; SDAG-NEXT: v_mov_b32_e32 v9, v17 +; SDAG-NEXT: v_mov_b32_e32 v10, v18 +; SDAG-NEXT: v_mov_b32_e32 v11, v19 +; SDAG-NEXT: v_mov_b32_e32 v12, v20 +; SDAG-NEXT: v_mov_b32_e32 v13, v21 +; SDAG-NEXT: v_mov_b32_e32 v14, v22 +; SDAG-NEXT: v_mov_b32_e32 v15, v23 
; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr: @@ -3727,43 +2484,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v25, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, 
a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: v_mov_b32_e32 v4, v12 +; GISEL-NEXT: v_mov_b32_e32 v5, v13 +; GISEL-NEXT: v_mov_b32_e32 v6, v14 +; GISEL-NEXT: v_mov_b32_e32 v7, v15 +; GISEL-NEXT: v_mov_b32_e32 v8, v16 +; GISEL-NEXT: v_mov_b32_e32 v9, v17 +; GISEL-NEXT: v_mov_b32_e32 v10, v18 +; GISEL-NEXT: v_mov_b32_e32 v11, v19 +; GISEL-NEXT: v_mov_b32_e32 v12, v20 +; GISEL-NEXT: v_mov_b32_e32 v13, v21 +; GISEL-NEXT: v_mov_b32_e32 v14, v22 +; GISEL-NEXT: v_mov_b32_e32 v15, v23 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -3781,43 +2522,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], 
a[0:15], v24, v8 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: v_mov_b32_e32 v2, v10 +; SDAG-NEXT: v_mov_b32_e32 v3, v11 +; SDAG-NEXT: v_mov_b32_e32 v4, v12 +; SDAG-NEXT: v_mov_b32_e32 v5, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v14 +; SDAG-NEXT: v_mov_b32_e32 v7, v15 +; SDAG-NEXT: v_mov_b32_e32 v8, v16 +; SDAG-NEXT: v_mov_b32_e32 v9, v17 +; SDAG-NEXT: v_mov_b32_e32 v10, v18 +; SDAG-NEXT: v_mov_b32_e32 v11, v19 +; SDAG-NEXT: v_mov_b32_e32 v12, v20 +; SDAG-NEXT: v_mov_b32_e32 v13, v21 +; SDAG-NEXT: v_mov_b32_e32 v14, v22 +; SDAG-NEXT: v_mov_b32_e32 v15, v23 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: @@ -3831,90 +2556,108 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v12 
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 -; GISEL-NEXT: v_mov_b32_e32 v8, s20 +; GISEL-NEXT: v_mov_b32_e32 v25, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: v_mov_b32_e32 v4, v12 +; GISEL-NEXT: v_mov_b32_e32 v5, v13 +; GISEL-NEXT: v_mov_b32_e32 v6, v14 +; GISEL-NEXT: v_mov_b32_e32 v7, v15 +; GISEL-NEXT: v_mov_b32_e32 v8, v16 +; GISEL-NEXT: v_mov_b32_e32 v9, v17 +; GISEL-NEXT: v_mov_b32_e32 v10, v18 +; GISEL-NEXT: v_mov_b32_e32 v11, v19 +; GISEL-NEXT: v_mov_b32_e32 v12, v20 +; 
GISEL-NEXT: v_mov_b32_e32 v13, v21 +; GISEL-NEXT: v_mov_b32_e32 v14, v22 +; GISEL-NEXT: v_mov_b32_e32 v15, v23 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s16 -; GCN-NEXT: v_accvgpr_write_b32 a5, s17 -; GCN-NEXT: v_accvgpr_write_b32 a6, s18 -; GCN-NEXT: v_accvgpr_write_b32 a7, s19 -; GCN-NEXT: v_accvgpr_write_b32 a8, s20 -; GCN-NEXT: v_accvgpr_write_b32 a9, s21 -; GCN-NEXT: v_accvgpr_write_b32 a10, s22 -; GCN-NEXT: v_accvgpr_write_b32 a11, s23 -; GCN-NEXT: v_accvgpr_write_b32 a12, s24 -; GCN-NEXT: v_accvgpr_write_b32 a13, s25 -; GCN-NEXT: v_accvgpr_write_b32 a14, s26 -; GCN-NEXT: v_accvgpr_write_b32 a15, s27 -; GCN-NEXT: v_mov_b32_e32 v17, s28 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 15 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; 
GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v33, v7 +; SDAG-NEXT: v_mov_b32_e32 v32, v6 +; SDAG-NEXT: v_mov_b32_e32 v31, v5 +; SDAG-NEXT: v_mov_b32_e32 v30, v4 +; SDAG-NEXT: v_mov_b32_e32 v29, v3 +; SDAG-NEXT: v_mov_b32_e32 v28, v2 +; SDAG-NEXT: v_mov_b32_e32 v27, v1 +; SDAG-NEXT: v_mov_b32_e32 v26, v0 +; SDAG-NEXT: v_mov_b32_e32 v25, v15 +; SDAG-NEXT: v_mov_b32_e32 v24, v14 +; SDAG-NEXT: v_mov_b32_e32 v23, v13 +; SDAG-NEXT: v_mov_b32_e32 v22, v12 +; SDAG-NEXT: v_mov_b32_e32 v21, v11 +; SDAG-NEXT: v_mov_b32_e32 v20, v10 +; SDAG-NEXT: v_mov_b32_e32 v19, v9 +; SDAG-NEXT: v_mov_b32_e32 v18, v8 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-NEXT: v_mov_b32_e32 v3, s3 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v17, s28 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; 
GISEL-NEXT: v_mov_b32_e32 v20, v2 +; GISEL-NEXT: v_mov_b32_e32 v21, v3 +; GISEL-NEXT: v_mov_b32_e32 v22, v4 +; GISEL-NEXT: v_mov_b32_e32 v23, v5 +; GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GISEL-NEXT: v_mov_b32_e32 v25, v7 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s1 +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v32, v14 +; GISEL-NEXT: v_mov_b32_e32 v33, v15 +; GISEL-NEXT: s_mov_b32 s14, s2 +; GISEL-NEXT: s_mov_b32 s15, s3 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v17, s28 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -3931,52 +2674,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_mov_b32_e32 v24, 
s28 -; SDAG-NEXT: v_mov_b32_e32 v25, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 +; SDAG-NEXT: v_mov_b32_e32 v16, v15 +; SDAG-NEXT: v_mov_b32_e32 v17, v14 +; SDAG-NEXT: v_mov_b32_e32 v15, v13 +; SDAG-NEXT: v_mov_b32_e32 v14, v12 +; SDAG-NEXT: v_mov_b32_e32 v13, v11 +; SDAG-NEXT: v_mov_b32_e32 v12, v10 +; SDAG-NEXT: v_mov_b32_e32 v11, v9 +; SDAG-NEXT: v_mov_b32_e32 v10, v8 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v8, s28 +; SDAG-NEXT: v_mov_b32_e32 v9, s29 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 15 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; 
SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: @@ -3986,56 +2711,46 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v24, s20 -; GISEL-NEXT: v_mov_b32_e32 v25, s21 -; GISEL-NEXT: v_mov_b32_e32 v26, s22 -; GISEL-NEXT: v_mov_b32_e32 v27, s23 -; GISEL-NEXT: v_mov_b32_e32 v28, s24 -; GISEL-NEXT: v_mov_b32_e32 v29, s25 -; GISEL-NEXT: v_mov_b32_e32 v30, s26 -; GISEL-NEXT: v_mov_b32_e32 v31, s27 -; GISEL-NEXT: v_mov_b32_e32 v32, s28 -; GISEL-NEXT: v_mov_b32_e32 v33, s29 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v31 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v32 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v33 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 -; GISEL-NEXT: v_accvgpr_write_b32 
a11, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v26, v8 +; GISEL-NEXT: v_mov_b32_e32 v27, v9 +; GISEL-NEXT: v_mov_b32_e32 v28, v10 +; GISEL-NEXT: v_mov_b32_e32 v29, v11 +; GISEL-NEXT: v_mov_b32_e32 v30, v12 +; GISEL-NEXT: v_mov_b32_e32 v31, v13 +; GISEL-NEXT: v_mov_b32_e32 v16, s20 +; GISEL-NEXT: v_mov_b32_e32 v17, s21 +; GISEL-NEXT: v_mov_b32_e32 v18, s22 +; GISEL-NEXT: v_mov_b32_e32 v19, s23 +; GISEL-NEXT: v_mov_b32_e32 v20, s24 +; GISEL-NEXT: v_mov_b32_e32 v21, s25 +; GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GISEL-NEXT: v_mov_b32_e32 v23, s27 +; GISEL-NEXT: v_mov_b32_e32 v24, s28 +; GISEL-NEXT: v_mov_b32_e32 v25, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[32:39], v[0:7], v[16:31], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: 
v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result @@ -4045,89 +2760,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 33 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 33 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; 
SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 33 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; 
GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 33 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; 
GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result @@ -4137,89 +2822,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, 
a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 
65, i32 2, i32 -2) ret <16 x float> %result @@ -4229,89 +2884,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: 
v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, 1.0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; 
GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) ret <16 x float> %result @@ -4321,89 +2946,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: 
v_mov_b32_e32 v31, -2 -; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 1.0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 
+; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v32, -2 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; 
GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) ret <16 x float> %result @@ -4413,89 +3008,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v33, 0.15915494 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: 
v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 -; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v33, 1.0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) ret <16 x float> %result @@ -4505,89 +3070,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d -; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 
+; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d -; 
GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, 0x4d ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: 
v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) ret <16 x float> %result @@ -4764,80 +3299,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s16 -; SDAG-NEXT: v_mov_b32_e32 v7, s17 -; SDAG-NEXT: v_mov_b32_e32 v8, s18 -; SDAG-NEXT: v_mov_b32_e32 v9, s19 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s16 +; SDAG-NEXT: v_mov_b32_e32 v23, s17 +; SDAG-NEXT: v_mov_b32_e32 v24, s18 +; SDAG-NEXT: v_mov_b32_e32 v25, s19 +; SDAG-NEXT: v_mov_b32_e32 v26, s20 +; SDAG-NEXT: v_mov_b32_e32 v27, s21 +; SDAG-NEXT: v_mov_b32_e32 v28, s22 +; SDAG-NEXT: v_mov_b32_e32 v29, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: v_mov_b32_e32 v14, s24 -; SDAG-NEXT: v_mov_b32_e32 v15, s25 -; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v30, s24 +; SDAG-NEXT: v_mov_b32_e32 v31, s25 +; SDAG-NEXT: 
v_mov_b32_e32 v32, s26 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v33, s27 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mov_b32_e32 v18, s20 +; SDAG-NEXT: v_mov_b32_e32 v19, s21 +; SDAG-NEXT: v_mov_b32_e32 v20, s22 +; SDAG-NEXT: v_mov_b32_e32 v21, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48 +; 
SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v24, s14 +; SDAG-NEXT: v_mov_b32_e32 v25, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s12 +; SDAG-NEXT: v_mov_b32_e32 v23, s13 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1 ; 
SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4846,63 +3373,55 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v20, s0 -; GISEL-NEXT: v_mov_b32_e32 v21, s1 -; GISEL-NEXT: 
v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b32_e32 v36, s0 +; GISEL-NEXT: v_mov_b32_e32 v37, s1 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 
v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) @@ -4915,80 +3434,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v0, 42 -; SDAG-NEXT: v_mov_b32_e32 v1, 25 +; SDAG-NEXT: v_mov_b32_e32 v16, 42 +; SDAG-NEXT: v_mov_b32_e32 v17, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s16 -; SDAG-NEXT: v_mov_b32_e32 v7, s17 -; SDAG-NEXT: v_mov_b32_e32 v8, s18 -; SDAG-NEXT: v_mov_b32_e32 
v9, s19 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s16 +; SDAG-NEXT: v_mov_b32_e32 v23, s17 +; SDAG-NEXT: v_mov_b32_e32 v24, s18 +; SDAG-NEXT: v_mov_b32_e32 v25, s19 +; SDAG-NEXT: v_mov_b32_e32 v26, s20 +; SDAG-NEXT: v_mov_b32_e32 v27, s21 +; SDAG-NEXT: v_mov_b32_e32 v28, s22 +; SDAG-NEXT: v_mov_b32_e32 v29, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v14, s24 -; SDAG-NEXT: v_mov_b32_e32 v15, s25 -; SDAG-NEXT: v_mov_b32_e32 v16, s26 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 +; SDAG-NEXT: v_mov_b32_e32 v30, s24 +; SDAG-NEXT: v_mov_b32_e32 v31, s25 +; SDAG-NEXT: v_mov_b32_e32 v32, s26 +; SDAG-NEXT: v_mov_b32_e32 v33, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: 
v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v18, s20 +; SDAG-NEXT: v_mov_b32_e32 v19, s21 +; SDAG-NEXT: v_mov_b32_e32 v20, s22 +; SDAG-NEXT: v_mov_b32_e32 v21, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v24, s14 +; SDAG-NEXT: v_mov_b32_e32 v25, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s12 +; SDAG-NEXT: v_mov_b32_e32 v23, s13 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 
-; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4996,62 +3507,54 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v20, 25 -; GISEL-NEXT: v_mov_b32_e32 v21, 42 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b32_e32 v36, 25 +; GISEL-NEXT: v_mov_b32_e32 v37, 42 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] 
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0] 
blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -5322,43 +3825,27 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, 
v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <16 x float> %result @@ -5368,43 +3855,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; 
GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) ret <16 x float> %result @@ -5414,89 +3885,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 1 -; SDAG-NEXT: v_mov_b32_e32 v32, 0 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: 
v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, 1 +; SDAG-NEXT: v_mov_b32_e32 v33, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0 -; GISEL-NEXT: v_mov_b32_e32 v32, 1 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 0 +; GISEL-NEXT: v_mov_b32_e32 v33, 1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: 
v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result @@ -5506,89 +3947,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: v_mov_b32_e32 v31, 0 -; SDAG-NEXT: v_mov_b32_e32 v32, 1 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v33, 1 ; SDAG-NEXT: 
s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 1 -; GISEL-NEXT: v_mov_b32_e32 v32, 0 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: 
v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v32, 1 +; GISEL-NEXT: v_mov_b32_e32 v33, 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; 
GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result @@ -5602,89 +4013,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: 
v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: 
v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -5697,89 +4076,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: 
v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: 
scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -5792,87 +4139,55 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 
v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], 
v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -5885,42 +4200,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, 
v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: 
v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 2, ; cbsz @@ -5933,89 +4232,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: 
v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: 
v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x 
float> %arg2, i32 0, ; cbsz @@ -6028,89 +4295,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, 
a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; 
GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -6124,43 +4359,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: 
v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; 
GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 0, ; cbsz @@ -6174,43 +4392,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: scratch_load_dword v31, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v14 -; GCN-NEXT: v_accvgpr_write_b32 a1, v15 -; GCN-NEXT: v_accvgpr_write_b32 a2, v16 -; GCN-NEXT: v_accvgpr_write_b32 a3, v17 -; GCN-NEXT: v_accvgpr_write_b32 a4, v18 -; GCN-NEXT: v_accvgpr_write_b32 a5, v19 -; GCN-NEXT: v_accvgpr_write_b32 a6, v20 -; GCN-NEXT: v_accvgpr_write_b32 a7, v21 -; GCN-NEXT: v_accvgpr_write_b32 a8, v22 -; GCN-NEXT: v_accvgpr_write_b32 a9, v23 -; GCN-NEXT: v_accvgpr_write_b32 a10, v24 -; GCN-NEXT: v_accvgpr_write_b32 a11, v25 -; GCN-NEXT: v_accvgpr_write_b32 a12, v26 -; GCN-NEXT: v_accvgpr_write_b32 a13, v27 -; GCN-NEXT: v_accvgpr_write_b32 a14, v28 -; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v18 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v20 +; GCN-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v22 +; GCN-NEXT: v_mov_b32_e32 v9, v23 +; GCN-NEXT: v_mov_b32_e32 v10, v24 +; GCN-NEXT: v_mov_b32_e32 v11, v25 +; GCN-NEXT: v_mov_b32_e32 v12, v26 +; GCN-NEXT: v_mov_b32_e32 v13, v27 +; GCN-NEXT: v_mov_b32_e32 v14, v28 +; GCN-NEXT: v_mov_b32_e32 v15, v29 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -6223,87 +4424,55 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 -; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 
+; SDAG-NEXT: scratch_load_dword v31, off, s32 +; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v16 +; SDAG-NEXT: v_mov_b32_e32 v1, v17 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v20 +; SDAG-NEXT: v_mov_b32_e32 v5, v21 +; SDAG-NEXT: v_mov_b32_e32 v6, v22 +; SDAG-NEXT: v_mov_b32_e32 v7, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, v24 +; SDAG-NEXT: v_mov_b32_e32 v9, v25 +; SDAG-NEXT: v_mov_b32_e32 v10, v26 +; SDAG-NEXT: v_mov_b32_e32 v11, v27 +; SDAG-NEXT: v_mov_b32_e32 v12, v28 +; SDAG-NEXT: v_mov_b32_e32 v13, v29 +; SDAG-NEXT: v_mov_b32_e32 v14, v30 +; SDAG-NEXT: v_mov_b32_e32 v15, v31 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: scratch_load_dword v31, off, s32 
offset:4 -; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: scratch_load_dword v31, off, s32 +; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GISEL-NEXT: s_nop 11 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_mov_b32_e32 v2, v18 +; 
GISEL-NEXT: v_mov_b32_e32 v3, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v20 +; GISEL-NEXT: v_mov_b32_e32 v5, v21 +; GISEL-NEXT: v_mov_b32_e32 v6, v22 +; GISEL-NEXT: v_mov_b32_e32 v7, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, v24 +; GISEL-NEXT: v_mov_b32_e32 v9, v25 +; GISEL-NEXT: v_mov_b32_e32 v10, v26 +; GISEL-NEXT: v_mov_b32_e32 v11, v27 +; GISEL-NEXT: v_mov_b32_e32 v12, v28 +; GISEL-NEXT: v_mov_b32_e32 v13, v29 +; GISEL-NEXT: v_mov_b32_e32 v14, v30 +; GISEL-NEXT: v_mov_b32_e32 v15, v31 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz @@ -6316,42 +4485,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: scratch_load_dword v31, off, s32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 +; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 
v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v18 +; GCN-NEXT: v_mov_b32_e32 v3, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v24 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v26 +; GCN-NEXT: v_mov_b32_e32 v11, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v28 +; GCN-NEXT: v_mov_b32_e32 v13, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v30 +; GCN-NEXT: v_mov_b32_e32 v15, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 4, ; cbsz diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 5475fa2..63466f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -9,22 +9,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v8, 1.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 2.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 4.0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[8:9], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32: @@ -32,22 +30,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0 ; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; 
GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -107,37 +103,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32: ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 1.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 2.0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0x40400000 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 4.0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-NEXT: 
v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32: @@ -145,37 +133,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0 ; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19] ; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000 ; 
GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-GISEL-NEXT: s_nop 9 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; 
GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index bc72687..9436b49 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -9,126 +9,127 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v34, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v33, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48 +; GCN-MINREG-NEXT: v_add_u32_e32 v36, s0, v0 +; GCN-MINREG-NEXT: v_add_u32_e32 v37, s1, v0 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 
offset:96 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:48 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 -; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31] +; GCN-MINREG-NEXT: v_mov_b32_e32 v32, s1 +; GCN-MINREG-NEXT: v_add_u32_e32 v35, 0x6000, v36 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16 -; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3] -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192 +; GCN-MINREG-NEXT: s_nop 0 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[24:27] offset:96 +; GCN-MINREG-NEXT: 
ds_write_b128 v37, v[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v37, v[0:3] +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:8192 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 
offset:24624 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:24576 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] 
offset:16384 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:49152 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, 
a[24:27] offset:24672 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:57392 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: 
v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:32784 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; @@ -136,128 +137,128 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v33, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v34, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v35, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1 -; 
GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s0, v33 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:48 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v33, s1, v33 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 
offset:8272 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:8192 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v33, s1 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272 -; 
GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:24576 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 
offset:49152 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, 0x6000, v32 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:24592 +; 
GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:57392 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768 -; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:32784 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: s_endpgm ; @@ 
-265,120 +266,120 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-ILP-NEXT: v_and_b32_e32 v32, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v33, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v34, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: v_add_u32_e32 v35, s0, v32 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:48 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:32 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:16 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:64 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:80 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:96 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:112 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v32, s1, v32 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; 
GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:112 +; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:8192 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:8208 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:8224 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:8240 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:8256 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:8272 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:8288 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:8304 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v32, s1 ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-ILP-NEXT: ds_write_b128 v0, 
a[28:31] offset:8304 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:8208 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:24688 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-ILP-NEXT: ds_read_b128 
a[4:7], v3 offset:49168 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:16400 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:49168 +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:49152 +; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:16432 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:49200 +; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:16416 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:49184 +; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:16464 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:49232 +; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:16448 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:49216 +; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:16496 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:49264 +; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:16480 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:49248 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v35, 0x6000, v35 ; GCN-ILP-NEXT: 
s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:57456 +; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:57440 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, 
v[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:32784 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) @@ -455,129 +456,129 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v33, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v32, 2.0 ; 
GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_add_u32_e32 v3, s0, v2 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:112 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:96 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:80 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:64 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:16 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:32 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-MINREG-NEXT: v_add_u32_e32 v35, s0, v0 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:112 +; GCN-MINREG-NEXT: v_add_u32_e32 v34, s1, v0 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:48 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] -; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 -; GCN-MINREG-NEXT: s_nop 1 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:64 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:48 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; GCN-MINREG-NEXT: 
ds_read_b128 a[24:27], v3 offset:8288 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:8192 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] -; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] +; GCN-MINREG-NEXT: v_mov_b32_e32 v34, s1 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 -; 
GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:8208 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000) -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:24672 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:24656 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:24640 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:24576 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:24592 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:24608 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:24624 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] 
-; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v36, 0x6000, v35 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:49232 +; 
GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:49152 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576 -; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 -; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 -; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 -; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408 -; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344 -; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 -; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 -; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:24624 
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:57392 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:32848 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:32832 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:32816 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32800 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 -; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] 
offset:32784 +; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:32768 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; @@ -585,129 +586,129 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v35, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v33, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v34, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s0, v35 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:48 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v35, s1, v35 ; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 
offset:8192 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v35, s1 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:8208 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000) -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 -; GCN-MAXOCC-NEXT: 
ds_read_b128 a[8:11], v0 offset:24608 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:24624 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, 
v[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:49152 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, 0x6000, v32 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576 -; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 -; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 
offset:57440 -; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 -; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 -; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 -; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 -; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 -; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:57392 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) -; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, 
a[16:19] offset:32832 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784 -; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:32768 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: s_endpgm ; @@ -715,127 +716,127 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 -; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-ILP-NEXT: v_and_b32_e32 v34, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v32, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v33, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: v_add_u32_e32 v35, s0, v34 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:48 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:32 +; GCN-ILP-NEXT: ds_read_b128 
v[4:7], v35 offset:16 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:64 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:80 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:96 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:112 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v34, s1, v34 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:16 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:32 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:48 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 
offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:64 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:80 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:96 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:112 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:8304 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v34, s1 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:8272 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:8224 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:8240 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:8208 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000) -; 
GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 -; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:24688 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 -; GCN-ILP-NEXT: 
ds_read_b128 a[20:23], v3 offset:49232 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:16400 +; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:49152 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:49264 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v35, 0x6000, v35 ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 -; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592 -; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 -; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608 -; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624 -; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640 -; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656 -; GCN-ILP-NEXT: 
ds_read_b128 a[20:23], v3 offset:57424 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672 -; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688 -; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:57440 +; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:57456 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) -; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 -; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 -; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 -; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848 -; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832 -; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816 -; GCN-ILP-NEXT: ds_write_b128 
v2, a[8:11] offset:32800 -; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 -; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:32784 +; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:32768 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index aa099b6..11d0099 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-mfma-vgpr-form=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s define amdgpu_kernel void @test_sched_group_barrier() #0 { ; GCN-LABEL: test_sched_group_barrier: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 
6eb9449..abf741c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s + +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,SDAG-VGPR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,GISEL-VGPR %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -62,6 +65,58 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; 
SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s16 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -82,6 +137,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> % ; GCN-NEXT: v_mov_b32_e32 
v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -97,6 +163,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -112,6 +189,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; 
GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -163,6 +251,48 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; 
GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -237,6 +367,70 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; 
SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; GISEL-VGPR-NEXT: 
global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -304,6 +498,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -366,6 +618,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } @@ 
-428,6 +738,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -524,6 +892,82 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 
v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -560,6 +1004,32 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr: +; GCN-VGPR: ; %bb.0: ; %bb +; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) +; GCN-VGPR-NEXT: s_nop 0 +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; GCN-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr 
addrspace(1) %arg, i32 %id @@ -580,6 +1050,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -595,6 +1076,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -610,6 +1102,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: +; GCN-VGPR: ; %bb.0: +; 
GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -643,6 +1146,30 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; GCN-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GCN-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; GCN-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; GCN-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; GCN-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; GCN-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; GCN-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; GCN-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; GCN-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GCN-VGPR-NEXT: s_nop 1 +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -685,6 +1212,38 @@ define amdgpu_kernel void 
@test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GCN-NEXT: s_endpgm +; +; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr: +; GCN-VGPR: ; %bb.0: ; %bb +; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; GCN-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; GCN-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) +; GCN-VGPR-NEXT: s_nop 0 +; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-VGPR-NEXT: s_nop 10 +; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; GCN-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -717,6 +1276,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x 
bfl ; GCN-NEXT: v_mov_b32_e32 v14, v26 ; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 +; GCN-VGPR-NEXT: s_nop 11 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -744,6 +1326,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v14, v26 ; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 11 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GCN-VGPR-NEXT: v_mov_b32_e32 
v7, v19 +; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } @@ -771,6 +1376,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v14, v26 ; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 11 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -827,6 +1455,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> 
inreg %arg ; GCN-NEXT: v_mov_b32_e32 v14, v26 ; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; GCN-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; GCN-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GCN-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; GCN-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GCN-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GCN-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; GCN-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; GCN-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; GCN-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; GCN-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; GCN-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; GCN-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; GCN-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; GCN-VGPR-NEXT: s_nop 1 +; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -895,6 +1559,64 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: 
test_smfmac_i32_16x16x128_i8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], 
s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %id @@ -915,6 +1637,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result } @@ -930,6 +1663,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; 
GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x i32> %result } @@ -945,6 +1689,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x i32> %result } @@ -996,6 +1751,48 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result } @@ -1076,6 +1873,76 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: 
global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] 
offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x i32>, ptr addrspace(1) %arg, i32 %id @@ -1143,6 +2010,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 
0, i32 immarg 0) ret <16 x i32> %result } @@ -1205,6 +2130,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x i32> %result } @@ -1267,6 +2250,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x i32> %result } @@ -1363,6 +2404,82 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], 
v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result } @@ -1431,6 +2548,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-VGPR-NEXT: 
s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -1451,6 +2626,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1466,6 +2652,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> 
%arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -1481,6 +2678,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1532,6 +2740,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 
v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1600,6 +2850,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] 
+; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -1620,6 +2928,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1635,6 +2954,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -1650,6 +2980,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, 
v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1701,6 +3042,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: +; GISEL-VGPR: 
; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1769,6 +3152,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, 
s15 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -1789,6 +3230,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; 
GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1804,6 +3256,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -1819,6 +3282,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; 
GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1870,6 +3344,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; 
GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1938,6 +3454,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-VGPR-NEXT: s_waitcnt 
vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: s_nop 7 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-VGPR-NEXT: s_nop 6 +; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -1958,6 +3532,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], 
v[4:11], v16 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -1973,6 +3558,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } @@ -1988,6 +3584,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, < ; GCN-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: +; GCN-VGPR: ; %bb.0: +; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-VGPR-NEXT: s_nop 7 +; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GCN-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2039,6 +3646,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: 
v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -2119,6 +3768,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; 
SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-VGPR-NEXT: 
global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -2186,6 +3905,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: 
v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -2248,6 +4025,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 
1, i32 immarg 3) ret <16 x float> %result } @@ -2310,6 +4145,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 
v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -2406,6 +4299,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -2486,6 +4455,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], 
v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -2553,6 +4592,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: 
v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -2615,6 +4712,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: 
v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } @@ -2677,6 +4832,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: 
v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -2773,6 +4986,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: 
v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -2853,6 +5142,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9 
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], 
s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -2920,6 +5279,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: 
test_smfmac_f32_32x32x64_fp8_bf8: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -2982,6 +5399,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; SDAG-VGPR: ; %bb.0: +; 
SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } @@ -3044,6 +5519,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -3140,6 +5673,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -3220,6 +5829,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr: +; SDAG-VGPR: ; %bb.0: ; %bb +; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; 
SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) +; SDAG-VGPR-NEXT: s_nop 0 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-VGPR-NEXT: s_nop 10 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; SDAG-VGPR-NEXT: s_endpgm +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr: +; GISEL-VGPR: ; %bb.0: ; %bb +; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64 +; 
GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) +; GISEL-VGPR-NEXT: s_nop 0 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-VGPR-NEXT: s_nop 10 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-VGPR-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -3287,6 +5966,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; 
SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -3349,6 +6086,64 @@ define <16 x float> 
@test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, 
v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } @@ -3411,6 +6206,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-VGPR-NEXT: s_nop 11 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-VGPR-NEXT: 
v_mov_b32_e32 v14, v26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } @@ -3507,6 +6360,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 ; 
GISEL-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; SDAG-VGPR: ; %bb.0: +; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23 +; SDAG-VGPR-NEXT: s_nop 1 +; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 +; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: +; GISEL-VGPR: ; %bb.0: +; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4 +; 
GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20 +; GISEL-VGPR-NEXT: s_nop 1 +; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 +; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 0af655df..b7efe1e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -101,120 +101,120 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] 
offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: 
global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader @@ -333,121 +333,119 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, 
v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: 
v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 
v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: 
global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader @@ -559,120 +557,120 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v32 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, 
a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: 
v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v32 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; 
GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader @@ -821,184 +819,120 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0x43140000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 
v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader @@ -1111,121 +1045,119 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 
v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] 
offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: 
v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, 
a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1378,124 +1310,122 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 
a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s1 +; GFX90A-NEXT: v_mov_b32_e32 v7, s1 +; GFX90A-NEXT: v_mov_b32_e32 v8, s1 +; GFX90A-NEXT: v_mov_b32_e32 v9, s1 +; GFX90A-NEXT: v_mov_b32_e32 v10, s1 +; GFX90A-NEXT: v_mov_b32_e32 v11, s1 +; GFX90A-NEXT: v_mov_b32_e32 v12, s1 +; GFX90A-NEXT: v_mov_b32_e32 v13, s1 +; GFX90A-NEXT: v_mov_b32_e32 v14, s1 +; GFX90A-NEXT: v_mov_b32_e32 v15, s1 +; GFX90A-NEXT: v_mov_b32_e32 v16, s1 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s1 +; GFX90A-NEXT: v_mov_b32_e32 v19, s1 +; GFX90A-NEXT: v_mov_b32_e32 v20, s1 +; GFX90A-NEXT: v_mov_b32_e32 v21, s1 +; GFX90A-NEXT: v_mov_b32_e32 v22, s1 +; GFX90A-NEXT: v_mov_b32_e32 v23, s1 +; GFX90A-NEXT: v_mov_b32_e32 v24, s1 +; GFX90A-NEXT: v_mov_b32_e32 v25, s1 +; GFX90A-NEXT: v_mov_b32_e32 v26, s1 +; GFX90A-NEXT: v_mov_b32_e32 v27, s1 +; GFX90A-NEXT: v_mov_b32_e32 v28, s1 +; GFX90A-NEXT: v_mov_b32_e32 v29, s1 +; GFX90A-NEXT: v_mov_b32_e32 v30, 
s1 +; GFX90A-NEXT: v_mov_b32_e32 v31, s1 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX942-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s1 +; GFX942-NEXT: v_mov_b32_e32 v5, s1 +; GFX942-NEXT: v_mov_b32_e32 v6, s1 +; GFX942-NEXT: v_mov_b32_e32 v7, s1 +; GFX942-NEXT: v_mov_b32_e32 v8, s1 +; GFX942-NEXT: v_mov_b32_e32 v9, s1 +; GFX942-NEXT: v_mov_b32_e32 v10, s1 +; GFX942-NEXT: v_mov_b32_e32 v11, s1 +; GFX942-NEXT: v_mov_b32_e32 v12, s1 +; GFX942-NEXT: v_mov_b32_e32 v13, s1 +; GFX942-NEXT: v_mov_b32_e32 
v14, s1 +; GFX942-NEXT: v_mov_b32_e32 v15, s1 +; GFX942-NEXT: v_mov_b32_e32 v16, s1 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s1 +; GFX942-NEXT: v_mov_b32_e32 v19, s1 +; GFX942-NEXT: v_mov_b32_e32 v20, s1 +; GFX942-NEXT: v_mov_b32_e32 v21, s1 +; GFX942-NEXT: v_mov_b32_e32 v22, s1 +; GFX942-NEXT: v_mov_b32_e32 v23, s1 +; GFX942-NEXT: v_mov_b32_e32 v24, s1 +; GFX942-NEXT: v_mov_b32_e32 v25, s1 +; GFX942-NEXT: v_mov_b32_e32 v26, s1 +; GFX942-NEXT: v_mov_b32_e32 v27, s1 +; GFX942-NEXT: v_mov_b32_e32 v28, s1 +; GFX942-NEXT: v_mov_b32_e32 v29, s1 +; GFX942-NEXT: v_mov_b32_e32 v30, s1 +; GFX942-NEXT: v_mov_b32_e32 v31, s1 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: 
global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %tmp0 = insertelement <32 x float> poison, float %init, i32 0 @@ -1644,127 +1574,123 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: v_mov_b32_e32 v14, v2 +; GFX90A-NEXT: v_mov_b32_e32 v15, v2 +; GFX90A-NEXT: v_mov_b32_e32 v16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v17, v2 +; GFX90A-NEXT: v_mov_b32_e32 v18, v2 +; GFX90A-NEXT: v_mov_b32_e32 v19, v2 +; GFX90A-NEXT: v_mov_b32_e32 v20, v2 +; GFX90A-NEXT: v_mov_b32_e32 v21, v2 +; GFX90A-NEXT: v_mov_b32_e32 v22, v2 +; GFX90A-NEXT: v_mov_b32_e32 v23, v2 +; GFX90A-NEXT: v_mov_b32_e32 v24, v2 +; GFX90A-NEXT: v_mov_b32_e32 v25, v2 +; GFX90A-NEXT: v_mov_b32_e32 v26, v2 +; GFX90A-NEXT: v_mov_b32_e32 v27, v2 +; GFX90A-NEXT: v_mov_b32_e32 v28, v2 +; GFX90A-NEXT: v_mov_b32_e32 v29, v2 +; GFX90A-NEXT: v_mov_b32_e32 v30, v2 +; GFX90A-NEXT: v_mov_b32_e32 v31, v2 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 
0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-NEXT: v_mov_b32_e32 v26, v2 +; 
GFX942-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-NEXT: v_mov_b32_e32 v31, v2 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1882,58 +1808,58 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr 
addrspace(1) %ar ; ; GFX90A-LABEL: test_mfma_loop_mfma_forward_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, 0 ; GFX90A-NEXT: .LBB7_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_mfma_forward_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0 ; GFX942-NEXT: .LBB7_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: 
global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) @@ -2087,126 +2013,126 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_agpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; 
GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] 
offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_agpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; 
GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) @@ -2366,41 +2292,41 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_nested_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 0 -; 
GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 
v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2409,7 +2335,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s1, s1, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s1, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2 @@ -2420,56 +2346,56 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.4: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 9 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: 
global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_nested_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: 
v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2478,7 +2404,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s1, s1, 
-1 ; GFX942-NEXT: s_cmp_lg_u32 s1, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB9_2 @@ -2489,17 +2415,17 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX942-NEXT: ; %bb.4: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 8 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader @@ -2613,163 +2539,97 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 
v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31] ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; 
GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; 
GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 
-; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_setpc_b64 s[30:31] entry: br label %for.cond.preheader @@ -2871,163 +2731,97 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; 
GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v32 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 
v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31] ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v32 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, 
a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_setpc_b64 s[30:31] entry: br label %for.cond.preheader diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 51cd564..323514b 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -219,397 +219,349 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: v_mov_b32_e32 v64, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v65, 2.0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s16 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s17 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s18 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s19 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s20 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s21 -; GREEDY90A-NEXT: 
v_accvgpr_write_b32 a6, s22 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s23 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s24 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s25 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s26 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s27 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s28 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s29 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s30 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s31 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, s16 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, s17 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, s18 +; GREEDY90A-NEXT: v_mov_b32_e32 v3, s19 +; GREEDY90A-NEXT: v_mov_b32_e32 v4, s20 +; GREEDY90A-NEXT: v_mov_b32_e32 v5, s21 +; GREEDY90A-NEXT: v_mov_b32_e32 v6, s22 +; GREEDY90A-NEXT: v_mov_b32_e32 v7, s23 +; GREEDY90A-NEXT: v_mov_b32_e32 v8, s24 +; GREEDY90A-NEXT: v_mov_b32_e32 v9, s25 +; GREEDY90A-NEXT: v_mov_b32_e32 v10, s26 +; GREEDY90A-NEXT: v_mov_b32_e32 v11, s27 +; GREEDY90A-NEXT: v_mov_b32_e32 v12, s28 +; GREEDY90A-NEXT: v_mov_b32_e32 v13, s29 +; GREEDY90A-NEXT: v_mov_b32_e32 v14, s30 +; GREEDY90A-NEXT: v_mov_b32_e32 v15, s31 +; GREEDY90A-NEXT: v_mov_b32_e32 v16, s0 +; GREEDY90A-NEXT: v_mov_b32_e32 v17, s1 +; GREEDY90A-NEXT: v_mov_b32_e32 v18, s2 
+; GREEDY90A-NEXT: v_mov_b32_e32 v19, s3 +; GREEDY90A-NEXT: v_mov_b32_e32 v20, s4 +; GREEDY90A-NEXT: v_mov_b32_e32 v21, s5 +; GREEDY90A-NEXT: v_mov_b32_e32 v22, s6 +; GREEDY90A-NEXT: v_mov_b32_e32 v23, s7 +; GREEDY90A-NEXT: v_mov_b32_e32 v24, s8 +; GREEDY90A-NEXT: v_mov_b32_e32 v25, s9 +; GREEDY90A-NEXT: v_mov_b32_e32 v26, s10 +; GREEDY90A-NEXT: v_mov_b32_e32 v27, s11 +; GREEDY90A-NEXT: v_mov_b32_e32 v28, s12 +; GREEDY90A-NEXT: v_mov_b32_e32 v29, s13 +; GREEDY90A-NEXT: v_mov_b32_e32 v30, s14 +; GREEDY90A-NEXT: v_mov_b32_e32 v31, s15 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31] +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v64, v65, v[0:31] ; GREEDY90A-NEXT: s_nop 15 ; GREEDY90A-NEXT: s_nop 2 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55 -; 
GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 -; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GREEDY90A-NEXT: v_mov_b32_e32 v2, v32 +; GREEDY90A-NEXT: v_mov_b32_e32 v3, v33 +; GREEDY90A-NEXT: v_mov_b32_e32 v4, v34 +; GREEDY90A-NEXT: v_mov_b32_e32 v5, v35 +; GREEDY90A-NEXT: v_mov_b32_e32 v6, v36 +; GREEDY90A-NEXT: v_mov_b32_e32 v7, v37 +; GREEDY90A-NEXT: v_mov_b32_e32 v8, v38 +; GREEDY90A-NEXT: v_mov_b32_e32 v9, v39 +; GREEDY90A-NEXT: v_mov_b32_e32 v10, v40 +; GREEDY90A-NEXT: v_mov_b32_e32 v11, v41 +; GREEDY90A-NEXT: v_mov_b32_e32 v12, v42 +; GREEDY90A-NEXT: v_mov_b32_e32 v13, v43 +; GREEDY90A-NEXT: v_mov_b32_e32 v14, v44 +; GREEDY90A-NEXT: v_mov_b32_e32 v15, v45 +; GREEDY90A-NEXT: v_mov_b32_e32 v16, v46 +; GREEDY90A-NEXT: v_mov_b32_e32 v17, v47 +; GREEDY90A-NEXT: v_mov_b32_e32 v18, v48 +; GREEDY90A-NEXT: v_mov_b32_e32 v19, v49 +; GREEDY90A-NEXT: v_mov_b32_e32 v20, v50 +; GREEDY90A-NEXT: v_mov_b32_e32 v21, v51 +; GREEDY90A-NEXT: v_mov_b32_e32 v22, v52 +; GREEDY90A-NEXT: v_mov_b32_e32 v23, v53 +; GREEDY90A-NEXT: v_mov_b32_e32 v24, v54 +; GREEDY90A-NEXT: v_mov_b32_e32 v25, v55 +; GREEDY90A-NEXT: v_mov_b32_e32 v26, v56 +; GREEDY90A-NEXT: v_mov_b32_e32 v27, v57 +; GREEDY90A-NEXT: v_mov_b32_e32 v28, v58 +; GREEDY90A-NEXT: v_mov_b32_e32 v29, v59 +; GREEDY90A-NEXT: v_mov_b32_e32 v30, v60 +; GREEDY90A-NEXT: v_mov_b32_e32 v31, v61 +; GREEDY90A-NEXT: v_mov_b32_e32 v32, 0 +; GREEDY90A-NEXT: s_nop 0 +; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31] ; GREEDY90A-NEXT: s_nop 15 ; GREEDY90A-NEXT: s_nop 2 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 -; GREEDY90A-NEXT: global_store_dwordx4 v2, 
a[16:19], s[34:35] offset:64 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GREEDY90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: v_mov_b32_e32 v64, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v65, 2.0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s16 -; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s17 -; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s18 -; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s19 -; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s20 -; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s21 -; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s22 -; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s23 -; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s24 -; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s25 -; 
GREEDY942-NEXT: v_accvgpr_write_b32 a10, s26 -; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s27 -; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s28 -; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s29 -; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s30 -; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s31 -; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0 -; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15 +; GREEDY942-NEXT: v_mov_b32_e32 v0, s16 +; GREEDY942-NEXT: v_mov_b32_e32 v1, s17 +; GREEDY942-NEXT: v_mov_b32_e32 v2, s18 +; GREEDY942-NEXT: v_mov_b32_e32 v3, s19 +; GREEDY942-NEXT: v_mov_b32_e32 v4, s20 +; GREEDY942-NEXT: v_mov_b32_e32 v5, s21 +; GREEDY942-NEXT: v_mov_b32_e32 v6, s22 +; GREEDY942-NEXT: v_mov_b32_e32 v7, s23 +; GREEDY942-NEXT: v_mov_b32_e32 v8, s24 +; GREEDY942-NEXT: v_mov_b32_e32 v9, s25 +; GREEDY942-NEXT: v_mov_b32_e32 v10, s26 +; GREEDY942-NEXT: v_mov_b32_e32 v11, s27 +; GREEDY942-NEXT: v_mov_b32_e32 v12, s28 +; GREEDY942-NEXT: v_mov_b32_e32 v13, s29 +; GREEDY942-NEXT: v_mov_b32_e32 v14, s30 +; GREEDY942-NEXT: v_mov_b32_e32 v15, s31 +; GREEDY942-NEXT: v_mov_b32_e32 v16, s0 +; GREEDY942-NEXT: v_mov_b32_e32 v17, s1 +; GREEDY942-NEXT: v_mov_b32_e32 v18, s2 +; GREEDY942-NEXT: v_mov_b32_e32 v19, s3 +; GREEDY942-NEXT: v_mov_b32_e32 v20, s4 +; GREEDY942-NEXT: v_mov_b32_e32 v21, s5 +; GREEDY942-NEXT: v_mov_b32_e32 v22, s6 +; 
GREEDY942-NEXT: v_mov_b32_e32 v23, s7 +; GREEDY942-NEXT: v_mov_b32_e32 v24, s8 +; GREEDY942-NEXT: v_mov_b32_e32 v25, s9 +; GREEDY942-NEXT: v_mov_b32_e32 v26, s10 +; GREEDY942-NEXT: v_mov_b32_e32 v27, s11 +; GREEDY942-NEXT: v_mov_b32_e32 v28, s12 +; GREEDY942-NEXT: v_mov_b32_e32 v29, s13 +; GREEDY942-NEXT: v_mov_b32_e32 v30, s14 +; GREEDY942-NEXT: v_mov_b32_e32 v31, s15 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31] +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v64, v65, v[0:31] +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v64, v65, v[0:31] ; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58 -; GREEDY942-NEXT: 
v_accvgpr_mov_b32 a29, a59 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 -; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GREEDY942-NEXT: v_mov_b32_e32 v2, v32 +; GREEDY942-NEXT: v_mov_b32_e32 v3, v33 +; GREEDY942-NEXT: v_mov_b32_e32 v4, v34 +; GREEDY942-NEXT: v_mov_b32_e32 v5, v35 +; GREEDY942-NEXT: v_mov_b32_e32 v6, v36 +; GREEDY942-NEXT: v_mov_b32_e32 v7, v37 +; GREEDY942-NEXT: v_mov_b32_e32 v8, v38 +; GREEDY942-NEXT: v_mov_b32_e32 v9, v39 +; GREEDY942-NEXT: v_mov_b32_e32 v10, v40 +; GREEDY942-NEXT: v_mov_b32_e32 v11, v41 +; GREEDY942-NEXT: v_mov_b32_e32 v12, v42 +; GREEDY942-NEXT: v_mov_b32_e32 v13, v43 +; GREEDY942-NEXT: v_mov_b32_e32 v14, v44 +; GREEDY942-NEXT: v_mov_b32_e32 v15, v45 +; GREEDY942-NEXT: v_mov_b32_e32 v16, v46 +; GREEDY942-NEXT: v_mov_b32_e32 v17, v47 +; GREEDY942-NEXT: v_mov_b32_e32 v18, v48 +; GREEDY942-NEXT: v_mov_b32_e32 v19, v49 +; GREEDY942-NEXT: v_mov_b32_e32 v20, v50 +; GREEDY942-NEXT: v_mov_b32_e32 v21, v51 +; GREEDY942-NEXT: v_mov_b32_e32 v22, v52 +; GREEDY942-NEXT: v_mov_b32_e32 v23, v53 +; GREEDY942-NEXT: v_mov_b32_e32 v24, v54 +; GREEDY942-NEXT: v_mov_b32_e32 v25, v55 +; GREEDY942-NEXT: v_mov_b32_e32 v26, v56 +; GREEDY942-NEXT: v_mov_b32_e32 v27, v57 +; GREEDY942-NEXT: v_mov_b32_e32 v28, v58 +; GREEDY942-NEXT: v_mov_b32_e32 v29, v59 +; GREEDY942-NEXT: v_mov_b32_e32 v30, v60 +; GREEDY942-NEXT: v_mov_b32_e32 v31, v61 +; GREEDY942-NEXT: v_mov_b32_e32 v32, 0 +; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v64, v65, v[0:31] ; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 -; GREEDY942-NEXT: global_store_dwordx4 v2, 
a[8:11], s[34:35] offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] -; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GREEDY942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY90A-GISEL: ; %bb.0: ; %bb ; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v64, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v65, 2.0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GREEDY90A-GISEL-NEXT: 
v_accvgpr_write_b32 a10, s10 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a20, s20 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a21, s21 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a22, s22 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a23, s23 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a24, s24 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a25, s25 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a26, s26 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a27, s27 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a28, s28 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a29, s29 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a30, s30 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a31, s31 +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 
v[22:23], s[22:23], s[22:23] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1] ; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v64, v65, v[0:31] ; GREEDY90A-GISEL-NEXT: s_nop 15 ; GREEDY90A-GISEL-NEXT: s_nop 2 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a34 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a35 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a36 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a37 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a38 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a39 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a40 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a41 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a42 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a43 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a44 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a45 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a16, a46 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a17, a47 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a18, a48 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a19, a49 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a20, a50 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a21, a51 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a22, a52 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a23, a53 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a24, a54 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a25, a55 -; 
GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a26, a56 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a27, a57 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a28, a58 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a29, a59 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a30, a60 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a31, a61 -; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v2, v32 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v3, v33 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, v34 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v5, v35 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, v36 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, v37 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v8, v38 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v9, v39 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v10, v40 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v11, v41 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v12, v42 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v13, v43 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v14, v44 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v15, v45 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v16, v46 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v17, v47 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v18, v48 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v19, v49 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v20, v50 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v21, v51 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v22, v52 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v23, v53 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v24, v54 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v25, v55 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v26, v56 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v27, v57 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v28, v58 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v29, v59 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v30, v60 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v31, v61 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v32, 0 +; GREEDY90A-GISEL-NEXT: s_nop 0 +; GREEDY90A-GISEL-NEXT: 
v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31] ; GREEDY90A-GISEL-NEXT: s_nop 15 -; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 +; GREEDY90A-GISEL-NEXT: s_nop 2 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 ; GREEDY90A-GISEL-NEXT: s_endpgm ; ; FAST90A-LABEL: test_mfma_f32_32x32x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 +; FAST90A-NEXT: v_mov_b32_e32 v34, 2.0 ; FAST90A-NEXT: v_mov_b32_e32 v0, 0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 -; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 -; FAST90A-NEXT: 
v_accvgpr_write_b32 a34, s38 -; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 -; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 -; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 -; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 -; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 -; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 -; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 -; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 -; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 -; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 -; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 -; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 -; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 -; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 +; FAST90A-NEXT: v_mov_b32_e32 v2, s36 +; FAST90A-NEXT: v_mov_b32_e32 v3, s37 +; FAST90A-NEXT: v_mov_b32_e32 v4, s38 +; FAST90A-NEXT: v_mov_b32_e32 v5, s39 +; FAST90A-NEXT: v_mov_b32_e32 v6, s40 +; FAST90A-NEXT: v_mov_b32_e32 v7, s41 +; FAST90A-NEXT: v_mov_b32_e32 v8, s42 +; FAST90A-NEXT: v_mov_b32_e32 v9, s43 +; FAST90A-NEXT: v_mov_b32_e32 v10, s44 +; FAST90A-NEXT: v_mov_b32_e32 v11, s45 +; FAST90A-NEXT: v_mov_b32_e32 v12, s46 +; FAST90A-NEXT: v_mov_b32_e32 v13, s47 +; FAST90A-NEXT: v_mov_b32_e32 v14, s48 +; FAST90A-NEXT: v_mov_b32_e32 v15, s49 +; FAST90A-NEXT: v_mov_b32_e32 v16, s50 +; FAST90A-NEXT: v_mov_b32_e32 v17, s51 +; 
FAST90A-NEXT: v_mov_b32_e32 v18, s4 +; FAST90A-NEXT: v_mov_b32_e32 v19, s5 +; FAST90A-NEXT: v_mov_b32_e32 v20, s6 +; FAST90A-NEXT: v_mov_b32_e32 v21, s7 +; FAST90A-NEXT: v_mov_b32_e32 v22, s8 +; FAST90A-NEXT: v_mov_b32_e32 v23, s9 +; FAST90A-NEXT: v_mov_b32_e32 v24, s10 +; FAST90A-NEXT: v_mov_b32_e32 v25, s11 +; FAST90A-NEXT: v_mov_b32_e32 v26, s12 +; FAST90A-NEXT: v_mov_b32_e32 v27, s13 +; FAST90A-NEXT: v_mov_b32_e32 v28, s14 +; FAST90A-NEXT: v_mov_b32_e32 v29, s15 +; FAST90A-NEXT: v_mov_b32_e32 v30, s16 +; FAST90A-NEXT: v_mov_b32_e32 v31, s17 +; FAST90A-NEXT: v_mov_b32_e32 v32, s18 +; FAST90A-NEXT: v_mov_b32_e32 v33, s19 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v1, v34, v[2:33] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v1, v34, v[2:33] ; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 -; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 -; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 -; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 -; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 -; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 -; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 -; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 -; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 -; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 -; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 -; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 -; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 -; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 -; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 -; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 -; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 -; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 -; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 -; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 -; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 -; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 -; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 -; 
FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 -; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 -; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 -; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 -; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 -; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 -; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 -; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 -; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 -; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 -; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 -; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 -; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 -; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 -; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 -; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 -; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 -; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 -; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 -; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 -; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 -; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 -; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 -; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 -; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 +; FAST90A-NEXT: v_mov_b32_e32 v4, v36 +; FAST90A-NEXT: v_mov_b32_e32 v5, v37 +; FAST90A-NEXT: v_mov_b32_e32 v6, v38 +; FAST90A-NEXT: v_mov_b32_e32 v7, v39 +; FAST90A-NEXT: v_mov_b32_e32 v8, v40 +; FAST90A-NEXT: v_mov_b32_e32 v9, v41 +; 
FAST90A-NEXT: v_mov_b32_e32 v10, v42 +; FAST90A-NEXT: v_mov_b32_e32 v11, v43 +; FAST90A-NEXT: v_mov_b32_e32 v12, v44 +; FAST90A-NEXT: v_mov_b32_e32 v13, v45 +; FAST90A-NEXT: v_mov_b32_e32 v14, v46 +; FAST90A-NEXT: v_mov_b32_e32 v15, v47 +; FAST90A-NEXT: v_mov_b32_e32 v16, v48 +; FAST90A-NEXT: v_mov_b32_e32 v17, v49 +; FAST90A-NEXT: v_mov_b32_e32 v18, v50 +; FAST90A-NEXT: v_mov_b32_e32 v19, v51 +; FAST90A-NEXT: v_mov_b32_e32 v20, v52 +; FAST90A-NEXT: v_mov_b32_e32 v21, v53 +; FAST90A-NEXT: v_mov_b32_e32 v22, v54 +; FAST90A-NEXT: v_mov_b32_e32 v23, v55 +; FAST90A-NEXT: v_mov_b32_e32 v24, v56 +; FAST90A-NEXT: v_mov_b32_e32 v25, v57 +; FAST90A-NEXT: v_mov_b32_e32 v26, v58 +; FAST90A-NEXT: v_mov_b32_e32 v27, v59 +; FAST90A-NEXT: v_mov_b32_e32 v28, v60 +; FAST90A-NEXT: v_mov_b32_e32 v29, v61 +; FAST90A-NEXT: v_mov_b32_e32 v30, v62 +; FAST90A-NEXT: v_mov_b32_e32 v31, v63 +; FAST90A-NEXT: v_mov_b32_e32 v32, v64 +; FAST90A-NEXT: v_mov_b32_e32 v33, v65 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v1, v34, v[2:33] ; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 -; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; FAST90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; FAST90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; FAST90A-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 +; FAST90A-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 +; FAST90A-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 +; FAST90A-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 +; FAST90A-NEXT: 
global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; FAST90A-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; FAST90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; FAST90A-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -707,185 +659,177 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: v_mov_b32_e32 v32, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v33, 2.0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 -; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY90A-NEXT: s_nop 9 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 +; GREEDY90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[16:31], v32, v33, v[0:15] +; GREEDY90A-NEXT: s_nop 10 +; GREEDY90A-NEXT: v_mov_b32_e32 v2, v16 +; GREEDY90A-NEXT: v_mov_b32_e32 v3, v17 +; GREEDY90A-NEXT: v_mov_b32_e32 v4, v18 +; GREEDY90A-NEXT: v_mov_b32_e32 v5, v19 +; GREEDY90A-NEXT: v_mov_b32_e32 v6, v20 +; GREEDY90A-NEXT: v_mov_b32_e32 v7, v21 +; GREEDY90A-NEXT: v_mov_b32_e32 v8, v22 +; GREEDY90A-NEXT: v_mov_b32_e32 v9, v23 +; GREEDY90A-NEXT: v_mov_b32_e32 v10, v24 +; GREEDY90A-NEXT: v_mov_b32_e32 v11, v25 +; GREEDY90A-NEXT: v_mov_b32_e32 v12, v26 +; GREEDY90A-NEXT: v_mov_b32_e32 v13, v27 +; GREEDY90A-NEXT: v_mov_b32_e32 v14, v28 +; GREEDY90A-NEXT: v_mov_b32_e32 v15, v29 +; GREEDY90A-NEXT: v_mov_b32_e32 v16, 0 +; GREEDY90A-NEXT: s_nop 0 +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15] ; GREEDY90A-NEXT: s_nop 10 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GREEDY90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GREEDY90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] 
offset:16 +; GREEDY90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: v_mov_b32_e32 v32, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v33, 2.0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 -; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 -; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 -; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 -; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 -; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 -; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 -; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 -; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 -; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 -; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 -; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 -; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 -; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 +; GREEDY942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GREEDY942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GREEDY942-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GREEDY942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GREEDY942-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GREEDY942-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GREEDY942-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GREEDY942-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] -; GREEDY942-NEXT: s_nop 8 -; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 -; GREEDY942-NEXT: 
v_accvgpr_mov_b32 a1, a19 -; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v32, v33, v[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[16:31], v32, v33, v[0:15] +; GREEDY942-NEXT: s_nop 9 +; GREEDY942-NEXT: v_mov_b32_e32 v2, v16 +; GREEDY942-NEXT: v_mov_b32_e32 v3, v17 +; GREEDY942-NEXT: v_mov_b32_e32 v4, v18 +; GREEDY942-NEXT: v_mov_b32_e32 v5, v19 +; GREEDY942-NEXT: v_mov_b32_e32 v6, v20 +; GREEDY942-NEXT: v_mov_b32_e32 v7, v21 +; GREEDY942-NEXT: v_mov_b32_e32 v8, v22 +; GREEDY942-NEXT: v_mov_b32_e32 v9, v23 +; GREEDY942-NEXT: v_mov_b32_e32 v10, v24 +; GREEDY942-NEXT: v_mov_b32_e32 v11, v25 +; GREEDY942-NEXT: v_mov_b32_e32 v12, v26 +; GREEDY942-NEXT: v_mov_b32_e32 v13, v27 +; GREEDY942-NEXT: v_mov_b32_e32 v14, v28 +; GREEDY942-NEXT: v_mov_b32_e32 v15, v29 +; GREEDY942-NEXT: v_mov_b32_e32 v16, 0 +; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v32, v33, v[0:15] ; GREEDY942-NEXT: s_nop 9 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] +; GREEDY942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GREEDY942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GREEDY942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GREEDY942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A-GISEL: ; %bb.0: ; %bb ; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v33, 2.0 ; GREEDY90A-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[16:31], v32, v33, v[0:15] ; GREEDY90A-GISEL-NEXT: s_nop 10 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16 -; 
GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a19 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a20 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a21 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a22 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a23 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a24 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a25 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a26 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a27 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a28 -; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a29 -; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 9 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v2, v16 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v3, v17 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, v18 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v5, v19 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, v20 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, v21 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v8, v22 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v9, v23 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v10, v24 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v11, v25 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v12, v26 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v13, v27 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v14, v28 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v15, v29 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GREEDY90A-GISEL-NEXT: s_nop 0 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15] +; GREEDY90A-GISEL-NEXT: s_nop 
10 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GREEDY90A-GISEL-NEXT: s_endpgm ; ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 -; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 -; FAST90A-NEXT: v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0 +; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 -; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8 -; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9 -; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10 -; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11 -; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12 -; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13 -; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14 -; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15 -; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16 -; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17 -; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 -; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 +; FAST90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[12:13], s[14:15], s[14:15] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[14:15], s[16:17], s[16:17] op_sel:[0,1] +; 
FAST90A-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1] ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[2:17], v0, v1, v[2:17] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[18:33], v0, v1, v[2:17] ; FAST90A-NEXT: s_nop 10 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 -; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 -; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 -; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19 -; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20 -; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21 -; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22 -; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23 -; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24 -; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25 -; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26 -; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27 -; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 -; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 +; FAST90A-NEXT: v_mov_b32_e32 v4, v18 +; FAST90A-NEXT: v_mov_b32_e32 v5, v19 +; FAST90A-NEXT: v_mov_b32_e32 v6, v20 +; FAST90A-NEXT: v_mov_b32_e32 v7, v21 +; FAST90A-NEXT: v_mov_b32_e32 v8, v22 +; FAST90A-NEXT: v_mov_b32_e32 v9, v23 +; FAST90A-NEXT: v_mov_b32_e32 v10, v24 +; FAST90A-NEXT: v_mov_b32_e32 v11, v25 +; FAST90A-NEXT: v_mov_b32_e32 v12, v26 +; FAST90A-NEXT: v_mov_b32_e32 v13, v27 +; FAST90A-NEXT: v_mov_b32_e32 v14, v28 +; FAST90A-NEXT: v_mov_b32_e32 v15, v29 +; FAST90A-NEXT: v_mov_b32_e32 v16, v30 +; FAST90A-NEXT: v_mov_b32_e32 v17, v31 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 10 -; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[2:17], v0, v1, v[2:17] +; FAST90A-NEXT: 
v_mov_b32_e32 v0, 0 +; FAST90A-NEXT: s_nop 9 +; FAST90A-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; FAST90A-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; FAST90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; FAST90A-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -934,68 +878,63 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY90A-NEXT: v_mov_b32_e32 v6, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v7, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v8, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GREEDY90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v6, v7, v[0:3] ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3] ; GREEDY90A-NEXT: s_nop 4 -; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GREEDY90A-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 +; GREEDY942-NEXT: v_mov_b32_e32 v6, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v7, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v8, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 -; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GREEDY942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[2:5], v6, v7, v[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] ; GREEDY942-NEXT: s_nop 3 -; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] +; GREEDY942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY90A-GISEL: ; %bb.0: ; %bb ; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, 1.0 +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, 2.0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; 
GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3] +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v6, v7, v[0:3] +; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3] ; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY90A-GISEL-NEXT: s_nop 1 -; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 3 -; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GREEDY90A-GISEL-NEXT: s_endpgm ; ; FAST90A-LABEL: test_mfma_f32_4x4x1f32: @@ -1007,20 +946,18 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) -; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 -; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 -; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 -; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 +; FAST90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; FAST90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3] +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[4:7], v1, v2, v[4:7] +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[8:11], v1, v2, v[4:7] ; FAST90A-NEXT: s_nop 4 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4 -; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a5 +; FAST90A-NEXT: 
v_mov_b32_e32 v6, v8 +; FAST90A-NEXT: v_mov_b32_e32 v7, v9 ; FAST90A-NEXT: s_nop 1 -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v1, v2, v[4:7] ; FAST90A-NEXT: s_nop 4 -; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; FAST90A-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0..5adb0cb 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,36 +6,36 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: s_mov_b32 s6, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v2 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: s_or_b32 s4, s6, 1 +; GFX942-NEXT: s_ashr_i32 s3, s6, 31 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[4:5], v[4:5], v[0:3] +; GFX942-NEXT: s_and_b32 s6, s3, s4 +; GFX942-NEXT: s_nop 5 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr2 +; GFX942-NEXT: ; implicit-def: $sgpr6 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 6509d80..efdb7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s ;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s ; Partial reg copy and spill missed during regalloc handled later 
at frame lowering. define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index da48af1..02aff39 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefixes=GCN,GFX90A %s define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { ; GFX908-LABEL: max_12regs_13a_used: diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b045c76..0a379ba 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -872,15 +872,13 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_16x16x32_i8 v[2:5], v[2:3], v[2:3], v[4:7] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13] +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[12:13] ; GFX942-NEXT: s_endpgm entry: %idx = 
call i32 @llvm.amdgcn.workitem.id.x() @@ -908,66 +906,66 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_mfma_half: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v1 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37] +; GFX942-NEXT: global_load_dwordx2 v[34:35], v0, s[36:37] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB15_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39] +; GFX942-NEXT: global_load_dwordx2 v[34:35], v0, s[38:39] ; GFX942-NEXT: .LBB15_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[42:43], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, s17 +; GFX942-NEXT: v_mov_b32_e32 v2, s18 +; GFX942-NEXT: v_mov_b32_e32 v3, s19 +; GFX942-NEXT: v_mov_b32_e32 v4, s20 +; GFX942-NEXT: v_mov_b32_e32 v5, s21 +; GFX942-NEXT: v_mov_b32_e32 v6, s22 +; GFX942-NEXT: v_mov_b32_e32 v7, s23 +; GFX942-NEXT: v_mov_b32_e32 v8, s24 +; GFX942-NEXT: v_mov_b32_e32 v9, s25 +; GFX942-NEXT: v_mov_b32_e32 v10, s26 +; GFX942-NEXT: v_mov_b32_e32 v11, s27 +; GFX942-NEXT: v_mov_b32_e32 v12, s28 +; GFX942-NEXT: v_mov_b32_e32 v13, s29 +; GFX942-NEXT: v_mov_b32_e32 v14, s30 +; GFX942-NEXT: v_mov_b32_e32 v15, s31 +; GFX942-NEXT: v_mov_b32_e32 v16, s0 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s2 +; GFX942-NEXT: v_mov_b32_e32 v19, s3 +; GFX942-NEXT: v_mov_b32_e32 v20, s4 +; GFX942-NEXT: v_mov_b32_e32 v21, s5 +; GFX942-NEXT: v_mov_b32_e32 v22, s6 +; GFX942-NEXT: v_mov_b32_e32 v23, s7 +; GFX942-NEXT: v_mov_b32_e32 v24, s8 +; GFX942-NEXT: v_mov_b32_e32 v25, s9 +; GFX942-NEXT: v_mov_b32_e32 v26, s10 +; GFX942-NEXT: v_mov_b32_e32 v27, s11 +; GFX942-NEXT: v_mov_b32_e32 v28, s12 +; GFX942-NEXT: v_mov_b32_e32 v29, s13 +; GFX942-NEXT: v_mov_b32_e32 v30, s14 +; GFX942-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 
blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41] +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[40:41] offset:112 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[40:41] offset:96 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[40:41] offset:80 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[40:41] offset:64 +; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[40:41] offset:48 +; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[40:41] offset:32 +; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[40:41] offset:16 +; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[40:41] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() |