diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
19 files changed, 1529 insertions, 1172 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 9e24023..ebbeab9 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -146,9 +146,9 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -437,9 +437,9 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v35 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND @@ -1045,9 +1045,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v39, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v39 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; use a3 v[0:31] ; GFX908-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir index a42cf43..7e82382d 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir @@ -40,8 +40,8 @@ body: | ; GFX908: liveins: $agpr0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: renamable $vgpr0 = COPY renamable $agpr0, implicit $exec - ; GFX908-NEXT: renamable $agpr1 = COPY renamable $vgpr0, implicit $exec - ; GFX908-NEXT: renamable $agpr2 = COPY renamable $vgpr0, implicit $exec + ; GFX908-NEXT: renamable $agpr1 = COPY $agpr0, implicit $exec + ; GFX908-NEXT: renamable $agpr2 = COPY $agpr0, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0, implicit $agpr1, implicit $agpr2 ; ; GFX90A-LABEL: name: do_not_propagate_agpr_to_agpr diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll index c4479b3..e3bc516 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll @@ -15,6 +15,9 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1250 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX1250 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1250 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX1250 %s + ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_FEATURE_XNACK_V3 (0x100) ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) @@ -52,6 +55,11 @@ ; SRAM-ECC-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) ; SRAM-ECC-GFX950: ] +; SRAM-ECC-GFX1250: Flags [ +; SRAM-ECC-GFX1250: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) +; SRAM-ECC-GFX1250: EF_AMDGPU_MACH_AMDGCN_GFX1250 (0x49) +; SRAM-ECC-GFX1250: ] + define amdgpu_kernel void @elf_header() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index ab51693..05d3e9c3 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -497,12 +497,10 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace ret void } -; FIXME: Should there be more checks here? minnum with NaN operand is simplified away. +; FIXME: Should there be more checks here? minnum with sNaN operand is simplified to qNaN. ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: -; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]] -; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]] -; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]] +; GCN: v_mov_b32_e32 v{{.+}}, 0x7fc00000 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 3de6df2..833be20 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -1949,8 +1949,7 @@ define float @v_fneg_self_minimumnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_self_minimumnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %a) %min.fneg = fneg float %min @@ -1961,7 +1960,7 @@ define float @v_fneg_self_minimumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_self_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %a) %min.fneg = fneg float %min @@ -2285,8 +2284,7 @@ define float @v_fneg_self_maximumnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_self_maximumnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float %a, float %a) %max.fneg = fneg float %max @@ -2297,7 +2295,7 @@ define float @v_fneg_self_maximumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_self_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_min_f32_e64 v0, -v0, -v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float %a, float %a) %max.fneg = fneg float %max diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll index 87a7c2e..cc4cc8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll @@ -72,5 +72,206 @@ define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x flo ret <4 x float> %result } +; Make sure this selects the VGPR form, if AGPRs available, but not +; enough. +define amdgpu_kernel void @not_enough_agprs(ptr addrspace(1) %arg) #2 { +; HEURRC-LABEL: not_enough_agprs: +; HEURRC: ; %bb.0: ; %bb +; HEURRC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v33, 1.0 +; HEURRC-NEXT: v_mov_b32_e32 v34, 2.0 +; HEURRC-NEXT: v_mov_b32_e32 v32, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; HEURRC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: v_mov_b32_e32 v4, s20 +; HEURRC-NEXT: v_mov_b32_e32 v5, s21 +; HEURRC-NEXT: v_mov_b32_e32 v6, s22 +; HEURRC-NEXT: v_mov_b32_e32 v7, s23 +; HEURRC-NEXT: v_mov_b32_e32 v8, s24 +; HEURRC-NEXT: v_mov_b32_e32 v9, s25 +; HEURRC-NEXT: v_mov_b32_e32 v10, s26 +; HEURRC-NEXT: v_mov_b32_e32 v11, s27 +; HEURRC-NEXT: v_mov_b32_e32 v12, s28 +; HEURRC-NEXT: v_mov_b32_e32 v13, s29 +; HEURRC-NEXT: v_mov_b32_e32 v14, s30 +; HEURRC-NEXT: v_mov_b32_e32 v15, s31 +; HEURRC-NEXT: v_mov_b32_e32 v16, s0 +; HEURRC-NEXT: v_mov_b32_e32 v17, s1 +; HEURRC-NEXT: v_mov_b32_e32 v18, s2 +; HEURRC-NEXT: v_mov_b32_e32 v19, s3 +; HEURRC-NEXT: v_mov_b32_e32 v20, s4 +; HEURRC-NEXT: v_mov_b32_e32 v21, s5 +; HEURRC-NEXT: v_mov_b32_e32 v22, s6 +; HEURRC-NEXT: v_mov_b32_e32 v23, s7 +; HEURRC-NEXT: v_mov_b32_e32 v24, s8 +; HEURRC-NEXT: v_mov_b32_e32 v25, s9 +; HEURRC-NEXT: v_mov_b32_e32 v26, s10 +; HEURRC-NEXT: v_mov_b32_e32 v27, s11 +; HEURRC-NEXT: v_mov_b32_e32 v28, s12 +; HEURRC-NEXT: v_mov_b32_e32 v29, s13 +; HEURRC-NEXT: v_mov_b32_e32 v30, s14 +; HEURRC-NEXT: v_mov_b32_e32 v31, s15 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: s_nop 15 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; HEURRC-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; HEURRC-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; HEURRC-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; HEURRC-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: not_enough_agprs: +; VGPRRC: ; %bb.0: ; %bb +; VGPRRC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v33, 1.0 +; VGPRRC-NEXT: v_mov_b32_e32 v34, 2.0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; VGPRRC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s19 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v12, s28 +; VGPRRC-NEXT: v_mov_b32_e32 v13, s29 +; VGPRRC-NEXT: v_mov_b32_e32 v14, s30 +; VGPRRC-NEXT: v_mov_b32_e32 v15, s31 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v20, s4 +; VGPRRC-NEXT: v_mov_b32_e32 v21, s5 +; VGPRRC-NEXT: v_mov_b32_e32 v22, s6 +; VGPRRC-NEXT: v_mov_b32_e32 v23, s7 +; VGPRRC-NEXT: v_mov_b32_e32 v24, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v25, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v26, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v27, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v28, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v29, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v30, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v31, s15 +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: s_nop 15 +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 +; VGPRRC-NEXT: s_endpgm +bb: + %in.1 = load <32 x float>, ptr addrspace(1) %arg, align 128 + %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, ptr addrspace(1) %arg, align 128 + ret void +} + +define <16 x float> @mfma_scale_respect_flag(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #2 { +; HEURRC-LABEL: mfma_scale_respect_flag: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: scratch_load_dword a15, off, s32 +; HEURRC-NEXT: scratch_load_dword v31, off, s32 offset:8 +; HEURRC-NEXT: scratch_load_dword v32, off, s32 offset:4 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v23 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v24 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v25 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v26 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v27 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v28 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v29 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v30 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; HEURRC-NEXT: s_nop 15 +; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4 +; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5 +; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6 +; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7 +; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8 +; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9 +; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10 +; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11 +; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12 +; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13 +; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14 +; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: mfma_scale_respect_flag: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: scratch_load_dword v31, off, s32 +; VGPRRC-NEXT: scratch_load_dword v32, off, s32 offset:8 +; VGPRRC-NEXT: scratch_load_dword v33, off, s32 offset:4 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] +; VGPRRC-NEXT: s_nop 15 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, v16 +; VGPRRC-NEXT: v_mov_b32_e32 v1, v17 +; VGPRRC-NEXT: v_mov_b32_e32 v2, v18 +; VGPRRC-NEXT: v_mov_b32_e32 v3, v19 +; VGPRRC-NEXT: v_mov_b32_e32 v4, v20 +; VGPRRC-NEXT: v_mov_b32_e32 v5, v21 +; VGPRRC-NEXT: v_mov_b32_e32 v6, v22 +; VGPRRC-NEXT: v_mov_b32_e32 v7, v23 +; VGPRRC-NEXT: v_mov_b32_e32 v8, v24 +; VGPRRC-NEXT: v_mov_b32_e32 v9, v25 +; VGPRRC-NEXT: v_mov_b32_e32 v10, v26 +; VGPRRC-NEXT: v_mov_b32_e32 v11, v27 +; VGPRRC-NEXT: v_mov_b32_e32 v12, v28 +; VGPRRC-NEXT: v_mov_b32_e32 v13, v29 +; VGPRRC-NEXT: v_mov_b32_e32 v14, v30 +; VGPRRC-NEXT: v_mov_b32_e32 v15, v31 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + attributes #0 = { "amdgpu-agpr-alloc"="32,256" } attributes #1 = { "amdgpu-agpr-alloc"="0,0" } +attributes #2 = { nounwind "amdgpu-agpr-alloc"="20" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 5ab8706..22bc62a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -726,12 +726,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0 ; GFX90A-VGPR-NEXT: s_nop 3 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-VGPR-NEXT: s_nop 7 ; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0 +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -765,10 +765,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 ; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s11 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v1, s11 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 @@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 @@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: v_mov_b32_e32 v0, s10 ; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v1, s11 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 @@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 @@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s10 ; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s11 +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 @@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s10 ; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s11 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 @@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 @@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index dc4c9291..2fb677e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1485,20 +1485,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1577,11 +1577,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1887,20 +1887,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 @@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19] ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 033a35f..13a96cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 -; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 +; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 +; GCN-NEXT: v_mov_b64_e32 v[2:3], 32 +; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -41,40 +41,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -88,15 +87,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 -; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 +; GCN-NEXT: v_mov_b64_e32 v[0:1], 48 +; GCN-NEXT: v_mov_b64_e32 v[2:3], 32 +; GCN-NEXT: v_mov_b64_e32 v[4:5], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -114,40 +113,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b64_e32 v[6:7], 0 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) @@ -250,13 +248,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -264,41 +262,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v40, s20 -; GCN-NEXT: v_mov_b32_e32 v41, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] -; GCN-NEXT: v_mov_b32_e32 v42, s22 -; GCN-NEXT: v_mov_b32_e32 v43, s23 -; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s20 +; GCN-NEXT: v_mov_b32_e32 v33, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] +; GCN-NEXT: v_mov_b32_e32 v34, s22 +; GCN-NEXT: v_mov_b32_e32 v35, s23 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v16, s12 ; GCN-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NEXT: v_mov_b32_e32 v18, s14 ; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v16, s8 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: v_mov_b32_e32 v18, s10 ; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -313,13 +311,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -327,41 +325,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v40, s20 -; GCN-NEXT: v_mov_b32_e32 v41, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v42, s22 -; GCN-NEXT: v_mov_b32_e32 v43, s23 -; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s20 +; GCN-NEXT: v_mov_b32_e32 v33, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v34, s22 +; GCN-NEXT: v_mov_b32_e32 v35, s23 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 2 ; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mov_b32_e32 v17, s17 ; GCN-NEXT: v_mov_b32_e32 v18, s18 ; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v16, s12 ; GCN-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NEXT: v_mov_b32_e32 v18, s14 ; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v16, s8 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: v_mov_b32_e32 v18, s10 ; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 7532062..ab0000f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,18 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -179,18 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -198,18 +198,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -260,18 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -298,18 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -317,18 +317,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -382,15 +382,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -408,40 +408,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -508,15 +507,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -534,40 +533,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -575,15 +573,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -593,40 +591,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16: @@ -765,15 +763,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -791,40 +789,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -891,15 +888,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] ; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -917,40 +914,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0 ; HEURRC-NEXT: s_nop 4 -; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s8 ; HEURRC-NEXT: v_mov_b32_e32 v1, s9 ; HEURRC-NEXT: v_mov_b32_e32 v2, s10 ; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v0, s12 ; HEURRC-NEXT: v_mov_b32_e32 v1, s13 ; HEURRC-NEXT: v_mov_b32_e32 v2, s14 ; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -958,15 +954,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -976,40 +972,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v48, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v49, s17 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 -; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 ; VGPRRC-NEXT: s_nop 8 -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 -; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags: @@ -1489,13 +1485,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v44, 0 +; SDAG-NEXT: v_mov_b32_e32 v36, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1503,41 +1499,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 -; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v16, s12 ; SDAG-NEXT: v_mov_b32_e32 v17, s13 ; SDAG-NEXT: v_mov_b32_e32 v18, s14 ; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v16, s8 ; SDAG-NEXT: v_mov_b32_e32 v17, s9 ; SDAG-NEXT: v_mov_b32_e32 v18, s10 ; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1592,13 +1588,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v44, 0 +; HEURRC-NEXT: v_mov_b32_e32 v36, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1606,41 +1602,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v40, s20 -; HEURRC-NEXT: v_mov_b32_e32 v41, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v42, s22 -; HEURRC-NEXT: v_mov_b32_e32 v43, s23 -; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 2 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v16, s12 ; HEURRC-NEXT: v_mov_b32_e32 v17, s13 ; HEURRC-NEXT: v_mov_b32_e32 v18, s14 ; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v16, s8 ; HEURRC-NEXT: v_mov_b32_e32 v17, s9 ; HEURRC-NEXT: v_mov_b32_e32 v18, s10 ; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1649,13 +1645,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1663,41 +1659,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1831,13 +1827,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v44, 0 +; SDAG-NEXT: v_mov_b32_e32 v36, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1845,41 +1841,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 -; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mov_b32_e32 v17, s17 ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v16, s12 ; SDAG-NEXT: v_mov_b32_e32 v17, s13 ; SDAG-NEXT: v_mov_b32_e32 v18, s14 ; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v16, s8 ; SDAG-NEXT: v_mov_b32_e32 v17, s9 ; SDAG-NEXT: v_mov_b32_e32 v18, s10 ; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1934,13 +1930,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v44, 0 +; HEURRC-NEXT: v_mov_b32_e32 v36, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1948,41 +1944,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v40, s20 -; HEURRC-NEXT: v_mov_b32_e32 v41, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v42, s22 -; HEURRC-NEXT: v_mov_b32_e32 v43, s23 -; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 2 ; HEURRC-NEXT: v_mov_b32_e32 v16, s16 ; HEURRC-NEXT: v_mov_b32_e32 v17, s17 ; HEURRC-NEXT: v_mov_b32_e32 v18, s18 ; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v16, s12 ; HEURRC-NEXT: v_mov_b32_e32 v17, s13 ; HEURRC-NEXT: v_mov_b32_e32 v18, s14 ; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 ; HEURRC-NEXT: v_mov_b32_e32 v16, s8 ; HEURRC-NEXT: v_mov_b32_e32 v17, s9 ; HEURRC-NEXT: v_mov_b32_e32 v18, s10 ; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1991,13 +1987,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -2005,41 +2001,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ -5425,18 +5421,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5444,18 +5440,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5463,18 +5459,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -5525,18 +5521,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5544,18 +5540,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5563,18 +5559,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 6eb9449..ee11b92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; SDAG-NEXT: v_mov_b32_e32 v5, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -120,30 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: @@ -187,17 +182,17 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -436,53 +431,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: @@ -541,24 +520,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v17, s16 +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GCN-NEXT: v_mov_b32_e32 v5, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -618,30 +597,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s1 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NEXT: v_mov_b32_e32 v6, s20 -; GCN-NEXT: v_mov_b32_e32 v7, s21 -; GCN-NEXT: v_mov_b32_e32 v8, s22 -; GCN-NEXT: v_mov_b32_e32 v9, s23 -; GCN-NEXT: v_accvgpr_write_b32 a0, s24 -; GCN-NEXT: v_accvgpr_write_b32 a1, s25 -; GCN-NEXT: v_accvgpr_write_b32 a2, s26 -; GCN-NEXT: v_accvgpr_write_b32 a3, s27 -; GCN-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NEXT: v_mov_b32_e32 v14, s0 +; GCN-NEXT: v_mov_b32_e32 v15, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s2 +; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NEXT: v_mov_b32_e32 v8, s18 +; GCN-NEXT: v_mov_b32_e32 v9, s19 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 +; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NEXT: v_mov_b32_e32 v4, s28 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -667,17 +641,17 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[26:27], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v28, s16 +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -779,53 +753,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, s0 -; GCN-NEXT: v_mov_b32_e32 v37, s1 -; GCN-NEXT: v_mov_b32_e32 v38, s2 -; GCN-NEXT: v_mov_b32_e32 v39, s3 -; GCN-NEXT: v_mov_b32_e32 v13, s25 -; GCN-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NEXT: v_mov_b32_e32 v16, s28 -; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: v_mov_b32_e32 v28, s16 -; GCN-NEXT: v_mov_b32_e32 v29, s17 -; GCN-NEXT: v_mov_b32_e32 v30, s18 -; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NEXT: v_mov_b32_e32 v26, s0 +; GCN-NEXT: v_mov_b32_e32 v27, s1 +; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: v_mov_b32_e32 v16, v10 +; GCN-NEXT: v_mov_b32_e32 v15, v9 +; GCN-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NEXT: v_mov_b32_e32 v13, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NEXT: v_mov_b32_e32 v4, s28 +; GCN-NEXT: v_mov_b32_e32 v5, s29 +; GCN-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NEXT: v_mov_b32_e32 v19, s17 +; GCN-NEXT: v_mov_b32_e32 v20, s18 +; GCN-NEXT: v_mov_b32_e32 v21, s19 +; GCN-NEXT: v_mov_b32_e32 v22, s20 +; GCN-NEXT: v_mov_b32_e32 v23, s21 +; GCN-NEXT: v_mov_b32_e32 v24, s22 +; GCN-NEXT: v_mov_b32_e32 v25, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 -; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -953,30 +911,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: @@ -1275,53 +1228,37 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: @@ -1489,30 +1426,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: @@ -1658,30 +1590,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: @@ -1827,30 +1754,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: @@ -1996,30 +1918,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s0 -; SDAG-NEXT: v_mov_b32_e32 v11, s1 -; SDAG-NEXT: v_mov_b32_e32 v12, s2 -; SDAG-NEXT: v_mov_b32_e32 v13, s3 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: v_mov_b32_e32 v6, s20 -; SDAG-NEXT: v_mov_b32_e32 v7, s21 -; SDAG-NEXT: v_mov_b32_e32 v8, s22 -; SDAG-NEXT: v_mov_b32_e32 v9, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v0, s28 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: @@ -2318,53 +2235,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: @@ -2685,53 +2586,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: @@ -3052,53 +2937,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: @@ -3419,53 +3288,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v36, s0 -; SDAG-NEXT: v_mov_b32_e32 v37, s1 -; SDAG-NEXT: v_mov_b32_e32 v38, s2 -; SDAG-NEXT: v_mov_b32_e32 v39, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 +; SDAG-NEXT: v_mov_b32_e32 v13, v7 +; SDAG-NEXT: v_mov_b32_e32 v12, v6 +; SDAG-NEXT: v_mov_b32_e32 v11, v5 +; SDAG-NEXT: v_mov_b32_e32 v10, v4 +; SDAG-NEXT: v_mov_b32_e32 v9, v3 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v7, v1 +; SDAG-NEXT: v_mov_b32_e32 v6, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, s24 +; SDAG-NEXT: v_mov_b32_e32 v1, s25 +; SDAG-NEXT: v_mov_b32_e32 v2, s26 +; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v22, s20 +; SDAG-NEXT: v_mov_b32_e32 v23, s21 +; SDAG-NEXT: v_mov_b32_e32 v24, s22 +; SDAG-NEXT: v_mov_b32_e32 v25, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 51cd564..f46116e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -95,66 +95,66 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 -; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 -; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 -; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 -; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 -; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 -; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 -; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 -; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 +; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 -; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 -; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 -; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 -; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 -; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 -; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 -; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 -; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 +; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 -; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 -; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 -; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 -; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 -; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 -; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 -; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 +; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 -; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 -; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 -; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 -; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 -; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 +; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44 -; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 -; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 -; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 -; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46 +; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49 +; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 +; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59 +; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 +; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 15 @@ -667,11 +667,11 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] ; GREEDY908-NEXT: s_nop 8 +; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a18 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 -; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 ; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v5 ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 9 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0..be1788c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -54,19 +54,20 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX908-NEXT: s_nop 6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a2 ; GFX908-NEXT: s_or_b32 s4, s3, 1 ; GFX908-NEXT: s_ashr_i32 s5, s3, 31 ; GFX908-NEXT: s_mov_b32 s3, s2 ; GFX908-NEXT: v_mov_b32_e32 v1, s2 -; GFX908-NEXT: s_nop 2 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 ; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: s_and_b32 s3, s5, s4 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 6b7d704..ede470b 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -1,13 +1,11 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefixes=CHECK,GFX11 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=CHECK,GFX12 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck --check-prefixes=CHECK,GFX12,DVGPR %s ; CHECK: .amdgpu_pal_metadata ; CHECK-NEXT: --- ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api: Vulkan ; CHECK-NEXT: .compute_registers: -; DVGPR-NEXT: .dynamic_vgpr_en: true ; CHECK-NEXT: .tg_size_en: true ; CHECK-NEXT: .tgid_x_en: false ; CHECK-NEXT: .tgid_y_en: false diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 5c0c366..5325499 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -1,17 +1,14 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11,NODVGPR -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK,NODVGPR -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK ; CHECK-LABEL: {{^}}_amdgpu_cs_main: -; NODVGPR: ; TotalNumSgprs: 4 -; DVGPR: ; TotalNumSgprs: 34 +; CHECK: ; TotalNumSgprs: 4 ; CHECK: ; NumVgprs: 2 ; CHECK: .amdgpu_pal_metadata ; CHECK-NEXT: --- ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api: Vulkan ; CHECK-NEXT: .compute_registers: -; DVGPR-NEXT: .dynamic_vgpr_en: true ; CHECK-NEXT: .tg_size_en: true ; CHECK-NEXT: .tgid_x_en: false ; CHECK-NEXT: .tgid_y_en: false @@ -57,7 +54,6 @@ ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0x9444d7d0 ; CHECK-NEXT: .debug_mode: false -; DVGPR-NEXT: .dynamic_vgpr_saved_count: 0x70 ; CHECK-NEXT: .entry_point: _amdgpu_cs_main ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 @@ -69,8 +65,7 @@ ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 -; NODVGPR-NEXT: .sgpr_count: 0x4 -; DVGPR-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .sgpr_count: 0x4 ; CHECK-NEXT: .sgpr_limit: 0x6a ; CHECK-NEXT: .threadgroup_dimensions: ; CHECK-NEXT: - 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-dvgpr.ll new file mode 100644 index 0000000..e598b0c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-dvgpr.ll @@ -0,0 +1,204 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK + +; CHECK-LABEL: {{^}}_amdgpu_cs_main: +; CHECK: ; TotalNumSgprs: 34 +; CHECK: ; NumVgprs: 2 +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .dynamic_vgpr_en: true +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: true +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: true +; CHECK-NEXT: .linear_centroid_ena: true +; CHECK-NEXT: .linear_sample_ena: true +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: true +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: true +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70 +; CHECK-NOT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true +; GFX11-NEXT: .ieee_mode: false +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x40 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .gs: +; CHECK-NEXT: .debug_mode: false +; CHECK-NOT: .entry_point: _amdgpu_gs_main +; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true +; GFX11-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0x200 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x1 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .hs: +; CHECK-NEXT: .debug_mode: false +; CHECK-NOT: .entry_point: _amdgpu_hs_main +; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true +; GFX11-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0x1000 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x1 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .ps: +; CHECK-NEXT: .debug_mode: false +; CHECK-NOT: .entry_point: _amdgpu_ps_main +; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true +; GFX11-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x1 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0x6 +; CHECK-NEXT:... +; CHECK-NEXT: .end_amdgpu_pal_metadata + +define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %arg1 to i64 + %i3 = or i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = and i32 %arg2, 1023 + %i6 = lshr i32 %arg2, 10 + %i7 = and i32 %i6, 1023 + %i8 = add nuw nsw i32 %i7, %i5 + %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %.idx = shl nuw nsw i32 %i8, 2 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0) + ret void +} + +define dllexport amdgpu_ps void @ps_shader() #1 { + ret void +} + +@LDS.GS = external addrspace(3) global [1 x i32], align 4 + +define dllexport amdgpu_gs void @gs_shader() { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +@LDS.HS = external addrspace(3) global [1024 x i32], align 4 + +define dllexport amdgpu_hs void @hs_shader() { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +attributes #0 = { nounwind memory(readwrite) "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" } + +attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" "amdgpu-dynamic-vgpr-block-size"="16" } + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\06"} +!1 = !{i32 7} diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll index 830872a..d2f26e8 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll @@ -1,17 +1,14 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11,NODVGPR -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK,NODVGPR -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK ; CHECK-LABEL: {{^}}_amdgpu_cs_main: -; NODVGPR: ; TotalNumSgprs: 4 -; DVGPR: ; TotalNumSgprs: 34 +; CHECK: ; TotalNumSgprs: 4 ; CHECK: ; NumVgprs: 2 ; CHECK: .amdgpu_pal_metadata ; CHECK-NEXT: --- ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api: Vulkan ; CHECK-NEXT: .compute_registers: -; DVGPR-NEXT: .dynamic_vgpr_en: true ; CHECK-NEXT: .tg_size_en: true ; CHECK-NEXT: .tgid_x_en: false ; CHECK-NEXT: .tgid_y_en: false @@ -57,7 +54,6 @@ ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0x9444d7d0 ; CHECK-NEXT: .debug_mode: false -; DVGPR-NEXT: .dynamic_vgpr_saved_count: 0x70 ; CHECK-NOT: .entry_point: _amdgpu_cs_main ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 @@ -69,8 +65,7 @@ ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 -; NODVGPR-NEXT: .sgpr_count: 0x4 -; DVGPR-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .sgpr_count: 0x4 ; CHECK-NEXT: .sgpr_limit: 0x6a ; CHECK-NEXT: .threadgroup_dimensions: ; CHECK-NEXT: - 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index b9e9893..9a23788 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -378,73 +378,66 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7] -; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7] -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7] -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; CHECK-NEXT: v_mov_b32_e32 v5, v4 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3] -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a2 -; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19] -; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0 -; CHECK-NEXT: global_store_short v[20:21], v23, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] +; CHECK-NEXT: global_store_short v[12:13], v17, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7] -; CHECK-NEXT: global_store_short v[20:21], v15, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v9, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v14, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16 +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 +; CHECK-NEXT: global_store_short v[12:13], v1, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v14, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: global_store_short v[12:13], v14, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v12, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: s_nop 6 +; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v8, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[20:21], v0, off +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: global_store_short v[12:13], v0, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll new file mode 100644 index 0000000..ba0fdc68 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=1 < %s | FileCheck %s + +declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg) + +define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 { +; CHECK-LABEL: test_smfmac_f32_32x32x32_f16__vgpr: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_load_dword s2, s[2:3], 0x64 +; CHECK-NEXT: s_mov_b32 s3, 0x3ff +; CHECK-NEXT: v_and_b32_e64 v1, v1, s3 +; CHECK-NEXT: s_mov_b32 s3, 6 +; CHECK-NEXT: v_lshlrev_b32_e64 v8, s3, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, v7 +; CHECK-NEXT: v_mov_b32_e32 v2, v6 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v5, v13 +; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 v7, v11 +; CHECK-NEXT: v_mov_b32_e32 v24, v10 +; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, v13 +; CHECK-NEXT: v_mov_b32_e32 v26, v12 +; CHECK-NEXT: v_mov_b32_e32 v27, v11 +; CHECK-NEXT: v_mov_b32_e32 v28, v10 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v8, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v29, v11 +; CHECK-NEXT: v_mov_b32_e32 v30, v10 +; CHECK-NEXT: v_mov_b32_e32 v31, v9 +; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v9, v31 +; CHECK-NEXT: v_mov_b32_e32 v10, v30 +; CHECK-NEXT: v_mov_b32_e32 v11, v29 +; CHECK-NEXT: v_mov_b32_e32 v12, v28 +; CHECK-NEXT: v_mov_b32_e32 v13, v27 +; CHECK-NEXT: v_mov_b32_e32 v14, v26 +; CHECK-NEXT: v_mov_b32_e32 v15, v25 +; CHECK-NEXT: v_mov_b32_e32 v16, v24 +; CHECK-NEXT: v_mov_b32_e32 v17, v7 +; CHECK-NEXT: v_mov_b32_e32 v18, v6 +; CHECK-NEXT: v_mov_b32_e32 v19, v5 +; CHECK-NEXT: v_mov_b32_e32 v20, v4 +; CHECK-NEXT: v_mov_b32_e32 v21, v3 +; CHECK-NEXT: v_mov_b32_e32 v22, v2 +; CHECK-NEXT: v_mov_b32_e32 v23, v1 +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[14:15] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[8:9] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[6:7] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 v[8:23], v[2:5], v[24:31], v1 cbsz:1 abid:2 +; CHECK-NEXT: s_nop 11 +; CHECK-NEXT: v_mov_b32_e32 v1, v23 +; CHECK-NEXT: v_mov_b32_e32 v6, v22 +; CHECK-NEXT: v_mov_b32_e32 v7, v21 +; CHECK-NEXT: v_mov_b32_e32 v2, v20 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 +; CHECK-NEXT: v_mov_b32_e32 v1, v19 +; CHECK-NEXT: v_mov_b32_e32 v6, v18 +; CHECK-NEXT: v_mov_b32_e32 v7, v17 +; CHECK-NEXT: v_mov_b32_e32 v2, v16 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v6, v14 +; CHECK-NEXT: v_mov_b32_e32 v7, v13 +; CHECK-NEXT: v_mov_b32_e32 v2, v12 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v1, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v2, v8 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: s_endpgm +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id + %in.1 = load <16 x float>, ptr addrspace(1) %gep + %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2) + store <16 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" } |