diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 174 |
1 files changed, 78 insertions, 96 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 04ee0bb..f78ea92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v12, s0 -; SDAG-NEXT: v_mov_b32_e32 v13, s1 -; SDAG-NEXT: v_mov_b32_e32 v14, s2 -; SDAG-NEXT: v_mov_b32_e32 v15, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v20, s28 -; SDAG-NEXT: v_mov_b32_e32 v21, s29 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_mov_b32_e32 v8, s24 -; SDAG-NEXT: v_mov_b32_e32 v9, s25 -; SDAG-NEXT: v_mov_b32_e32 v10, s26 -; SDAG-NEXT: v_mov_b32_e32 v11, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s24 +; SDAG-NEXT: v_mov_b32_e32 v11, s25 +; SDAG-NEXT: v_mov_b32_e32 v12, s26 +; SDAG-NEXT: v_mov_b32_e32 v13, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v5 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1895,7 +1895,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -1915,16 +1915,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v20, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1964,7 +1962,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -1974,7 +1972,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -1983,21 +1981,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2031,7 +2025,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2041,7 +2035,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2050,21 +2044,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2096,7 +2086,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -2107,7 +2097,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2116,14 +2106,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2162,7 +2148,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -2173,7 +2159,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2182,14 +2168,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } |