diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll | 11 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmed3.ll | 275 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1854 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 96 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 204 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/minmax.ll | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/readsteadycounter.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 239 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir | 32 |
11 files changed, 1363 insertions, 1386 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll index 26b9d99..8705647 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -206,7 +206,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 { ; global nnan function attribute always forces clamp combine -define float @test_min_max_global_nnan(float %a) #3 { +define float @test_min_max_global_nnan(float %a) { ; GFX10-LABEL: test_min_max_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -223,11 +223,11 @@ define float @test_min_max_global_nnan(float %a) #3 { ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0) - %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0) ret float %fmed } -define float @test_max_min_global_nnan(float %a) #3 { +define float @test_max_min_global_nnan(float %a) { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -244,7 +244,7 @@ define float @test_max_min_global_nnan(float %a) #3 { ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 1.0) - %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) + %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0) ret float %fmed } @@ -414,5 +414,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) attributes #0 = {"amdgpu-ieee"="true"} attributes #1 = {"amdgpu-ieee"="false"} attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"} -attributes #3 = {"no-nans-fp-math"="true"} attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index d2c93e7..696a87b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -232,7 +232,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 { ; global nnan function attribute always forces fmed3 combine -define float @test_min_max_global_nnan(float %a) #2 { +define float @test_min_max_global_nnan(float %a) { ; GFX10-LABEL: test_min_max_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -254,12 +254,12 @@ define float @test_min_max_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) + %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0) %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0) ret float %fmed } -define float @test_max_min_global_nnan(float %a) #2 { +define float @test_max_min_global_nnan(float %a) { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -281,8 +281,8 @@ define float @test_max_min_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %minnum = call float @llvm.minnum.f32(float %a, float 4.0) - %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) + %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0) + %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0) ret float %fmed } @@ -560,4 +560,3 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) attributes #0 = {"amdgpu-ieee"="true"} attributes #1 = {"amdgpu-ieee"="false"} -attributes #2 = {"no-nans-fp-math"="true"} diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 9e15225..3145a27 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -10,7 +10,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -342,7 +342,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -453,7 +453,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -569,7 +569,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -740,7 +740,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ret void } -define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -955,14 +955,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid %a = load float, ptr addrspace(1) %gep0 - %max = call float @llvm.maxnum.f32(float %a, float 2.0) - %med = call float @llvm.minnum.f32(float %max, float 4.0) + %max = call nnan float @llvm.maxnum.f32(float %a, float 2.0) + %med = call nnan float @llvm.minnum.f32(float %max, float 4.0) store float %med, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1297,10 +1297,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1487,10 +1487,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %b.fneg = fsub float -0.0, %b - %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b.fneg) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b.fneg) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1677,10 +1677,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %c.fneg = fsub float -0.0, %c - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1872,14 +1872,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %b.fabs = call float @llvm.fabs.f32(float %b) - %c.fabs = call float @llvm.fabs.f32(float %c) + %b.fabs = call nnan float @llvm.fabs.f32(float %b) + %c.fabs = call nnan float @llvm.fabs.f32(float %c) %c.fabs.fneg = fsub float -0.0, %c.fabs - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b.fabs) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void @@ -2082,16 +2082,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs %c.fabs = call float @llvm.fabs.f32(float %c) %c.fabs.fneg = fsub float -0.0, %c.fabs - %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2418,7 +2418,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ret void } -define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2570,7 +2570,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ret void } -define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2878,10 +2878,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3220,10 +3220,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3372,10 +3372,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3524,10 +3524,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3676,10 +3676,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3828,10 +3828,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3980,10 +3980,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4132,10 +4132,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4284,10 +4284,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4436,10 +4436,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4588,10 +4588,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4740,10 +4740,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4892,10 +4892,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5044,10 +5044,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5196,10 +5196,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5348,10 +5348,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5503,10 +5503,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp1 = call float @llvm.minnum.f32(float %a, float %b) - %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5515,7 +5515,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; Negative patterns ; --------------------------------------------------------------------- -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -5717,7 +5717,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -5944,7 +5944,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6146,7 +6146,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6352,7 +6352,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ret void } -define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6527,7 +6527,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6702,7 +6702,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6877,7 +6877,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -7270,10 +7270,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -7428,13 +7428,13 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %max = call float @llvm.maxnum.f32(float %a, float %b) - %minmax = call float @llvm.minnum.f32(float %max, float %c) + %max = call nnan float @llvm.maxnum.f32(float %a, float %b) + %minmax = call nnan float @llvm.minnum.f32(float %max, float %c) store float %minmax, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -7597,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ret void } -define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -7865,7 +7865,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -7998,7 +7998,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad } ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. -define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -8137,7 +8137,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ret void } -define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -8343,7 +8343,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ret void } -define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { +define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) { ; SI-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8384,7 +8384,7 @@ define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { ret float %med } -define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) #1 { +define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8452,7 +8452,7 @@ define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> % ret <2 x float> %med } -define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) #1 { +define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8525,7 +8525,7 @@ define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use( ret { float, float } %ins.1 } -define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8567,7 +8567,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8609,7 +8609,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8651,7 +8651,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 { ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8693,7 +8693,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 { ret float %med } -define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { +define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8772,7 +8772,7 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { ret half %med } -define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) #1 { +define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8848,7 +8848,7 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ret <2 x half> %med } -define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) #1 { +define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) { ; SI-LABEL: v_test_fmed3_r_i_i_f64_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8905,5 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0 declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 6b09424..eee232a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -49,7 +49,6 @@ bb: ret void } -; FIXME: This generates "instid1(/* invalid instid value */)". define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index b07dec3..689d147 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,1153 +6,1147 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $vgpr8 - ; GCN-NEXT: ; implicit-def: $vgpr94 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr106 - ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr133 - ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr25 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_readfirstlane_b32 s17, v16 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s18, s17, 7 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 + ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 + ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 + ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 + ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 + ; GCN-NEXT: ; implicit-def: $vgpr213 + ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr246 + ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr26 + ; GCN-NEXT: ; implicit-def: $vgpr27 + ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 + ; GCN-NEXT: v_add_u32_e32 v231, v19, v26 + ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr28 + ; GCN-NEXT: ; implicit-def: $vgpr29 + ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 + ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 + ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 + ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 + ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 + ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr196 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr211 + ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 + ; GCN-NEXT: ; implicit-def: $vgpr198 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 + ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 + ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr30 + ; GCN-NEXT: ; implicit-def: $vgpr31 + ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 + ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 + ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 + ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 + ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 + ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v230, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[188:191], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] + ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] + ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 + ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 + ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 + ; GCN-NEXT: ds_read_b128 v[160:163], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 + ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7 + ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] + ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] + ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: ds_read_b128 v[230:233], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v199, v[238:239] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v200, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v201, v[242:243] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v202, v[244:245] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] + ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] + ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 + ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 + ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 + ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 + ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 + ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 + ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 + ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 + ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_exp_f32_e32 v114, v138 + ; GCN-NEXT: v_exp_f32_e32 v115, v139 + ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_exp_f32_e32 v117, v141 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 + ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_exp_f32_e32 v120, v144 + ; GCN-NEXT: v_exp_f32_e32 v113, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 + ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 + ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 + ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 + ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112 + ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 + ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 + ; GCN-NEXT: v_exp_f32_e32 v121, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 + ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v97, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 + ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 + ; GCN-NEXT: v_exp_f32_e32 v98, v138 + ; GCN-NEXT: v_exp_f32_e32 v99, v127 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 + ; GCN-NEXT: v_exp_f32_e32 v100, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v199, v[188:189] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v200, v[190:191] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v201, v[192:193] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v202, v[194:195] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v102, v142 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 + ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 + ; GCN-NEXT: v_exp_f32_e32 v103, v150 + ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 + ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 + ; GCN-NEXT: v_exp_f32_e32 v104, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 + ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 + ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 + ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 + ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v106, v156 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 + ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v107, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 + ; GCN-NEXT: v_exp_f32_e32 v108, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] + ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 + ; GCN-NEXT: v_exp_f32_e32 v109, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 + ; GCN-NEXT: v_exp_f32_e32 v110, v156 + ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v111, v146 + ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v80, v129 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] + ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 + ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 + ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 + ; GCN-NEXT: v_exp_f32_e32 v82, v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v83, v135 + ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v199, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v200, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v201, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: ds_write_b64 v202, v[154:155] + ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 + ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v85, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 + ; GCN-NEXT: v_exp_f32_e32 v86, v156 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 + ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 + ; GCN-NEXT: v_exp_f32_e32 v87, v157 + ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 + ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 + ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 + ; GCN-NEXT: v_exp_f32_e32 v88, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 + ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 + ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 + ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 + ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v89, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 + ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v90, v158 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 + ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 + ; GCN-NEXT: v_exp_f32_e32 v91, v139 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 + ; GCN-NEXT: v_exp_f32_e32 v92, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 + ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 + ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v93, v125 + ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v94, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v95, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v125, v129 + ; GCN-NEXT: ds_read_b128 v[132:135], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 + ; GCN-NEXT: v_exp_f32_e32 v126, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 + ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 + ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v127, v143 + ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v199, v[150:151] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v200, v[152:153] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 + ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v202, v[156:157] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 + ; GCN-NEXT: v_exp_f32_e32 v131, v144 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 + ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_exp_f32_e32 v132, v145 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v133, v141 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[68:71], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v72, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v144 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_exp_f32_e32 v74, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] + ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 + ; GCN-NEXT: v_exp_f32_e32 v75, v142 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 + ; GCN-NEXT: v_exp_f32_e32 v76, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 + ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v77, v147 + ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 + ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v78, v67 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_exp_f32_e32 v79, v148 + ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v142, v146 + ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v137, v147 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v138, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_exp_f32_e32 v52, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 + ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 + ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 + ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 + ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 + ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v116, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v117, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v118, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v119, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v120, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v121, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v122, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v123, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v124, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v96, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v97, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v98, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v99, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v100, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 + ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 + ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 + ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 + ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 + ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee..e174fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index aa099b6..b65a1a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 12 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 12 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; EXACTCUTOFF-NEXT: s_nop 11 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 56f9c5d..d578d2e 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -612,10 +612,10 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) ; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call float @llvm.minnum.f32(float %x, float %y) - %tmp1 = call float @llvm.maxnum.f32(float %x, float %y) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z) - %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %x, float %y) + %tmp1 = call nnan float @llvm.maxnum.f32(float %x, float %y) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %z) + %tmp3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %tmp3, ptr addrspace(1) %arg ret void } @@ -646,10 +646,10 @@ define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x ; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y) - %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y) - %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z) - %tmp3 = call float @llvm.maximumnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minimumnum.f32(float %x, float %y) + %tmp1 = call nnan float @llvm.maximumnum.f32(float %x, float %y) + %tmp2 = call nnan float @llvm.minimumnum.f32(float %tmp1, float %z) + %tmp3 = call nnan float @llvm.maximumnum.f32(float %tmp0, float %tmp2) store float %tmp3, ptr addrspace(1) %arg ret void } @@ -1280,10 +1280,10 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GISEL-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4 ; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call half @llvm.minnum.f16(half %x, half %y) - %tmp1 = call half @llvm.maxnum.f16(half %x, half %y) - %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z) - %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2) + %tmp0 = call nnan half @llvm.minnum.f16(half %x, half %y) + %tmp1 = call nnan half @llvm.maxnum.f16(half %x, half %y) + %tmp2 = call nnan half @llvm.minnum.f16(half %tmp1, half %z) + %tmp3 = call nnan half @llvm.maxnum.f16(half %tmp0, half %tmp2) store half %tmp3, ptr addrspace(1) %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll index ddbae64..a95d8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll @@ -1,8 +1,8 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 declare i64 @llvm.readsteadycounter() #0 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788..8803f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -367,77 +367,76 @@ bb: define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-LABEL: illegal_mfma_after_rewrite: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[16:19] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v9, v8 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v8 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6 +; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: global_store_short v[0:1], v10, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 -; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CHECK-NEXT: global_store_short v[0:1], v6, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: global_store_short v[0:1], v24, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] ; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] +; CHECK-NEXT: global_store_short v[0:1], v6, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: global_store_short v[12:13], v0, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 { ; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class: ; CHECK: ; %bb.0: +; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v63, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v62, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v61, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v60, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v59, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v58, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v57, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v56, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v55, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v54, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v53, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v52, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v51, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v50, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v49, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v48, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v47, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v46, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v45, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v44, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v43, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v42, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v41, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v40, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v39, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v38, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v37, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v36, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v35, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v34, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0 -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v33 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v34 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v35 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v36 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v37 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v38 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v39 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v40 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v41 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v42 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v43 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v44 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v45 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v46 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v47 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v48 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v49 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v50 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v51 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v52 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v53 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v54 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v55 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v56 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v57 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v58 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v59 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v60 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 -; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] -; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31] +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] +; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43] +; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45] +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47] +; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49] +; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51] +; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir new file mode 100644 index 0000000..33b2f69 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir @@ -0,0 +1,32 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check that cycle counts are consistent with hazards. + +# CHECK: Cycle: 3 TopQ.A +# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c +# CHECK-NOT: Cycle: 9 TopQ.A +# CHECK: Cycle: 83 TopQ.A +# CHECK: Checking pending node SU(6) +# CHECK: Move SU(6) into Available Q + +--- +name: pending_queue_ready_cycle +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + %2:sgpr_128 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %18:areg_512 = IMPLICIT_DEF + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec + %7:vreg_512 = COPY %18 + SCHED_BARRIER 0 + S_NOP 0, implicit %18, implicit %7, implicit %84 + S_ENDPGM 0 +... |