aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll275
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir1854
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll204
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/readsteadycounter.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll239
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir32
11 files changed, 1363 insertions, 1386 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 26b9d99..8705647 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -206,7 +206,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
; global nnan function attribute always forces clamp combine
-define float @test_min_max_global_nnan(float %a) #3 {
+define float @test_min_max_global_nnan(float %a) {
; GFX10-LABEL: test_min_max_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -223,11 +223,11 @@ define float @test_min_max_global_nnan(float %a) #3 {
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
- %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
+ %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0)
ret float %fmed
}
-define float @test_max_min_global_nnan(float %a) #3 {
+define float @test_max_min_global_nnan(float %a) {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,7 +244,7 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 1.0)
- %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
+ %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0)
ret float %fmed
}
@@ -414,5 +414,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
attributes #0 = {"amdgpu-ieee"="true"}
attributes #1 = {"amdgpu-ieee"="false"}
attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"}
-attributes #3 = {"no-nans-fp-math"="true"}
attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index d2c93e7..696a87b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -232,7 +232,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
; global nnan function attribute always forces fmed3 combine
-define float @test_min_max_global_nnan(float %a) #2 {
+define float @test_min_max_global_nnan(float %a) {
; GFX10-LABEL: test_min_max_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -254,12 +254,12 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
ret float %fmed
}
-define float @test_max_min_global_nnan(float %a) #2 {
+define float @test_max_min_global_nnan(float %a) {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -281,8 +281,8 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
- %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
+ %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
+ %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
ret float %fmed
}
@@ -560,4 +560,3 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
attributes #0 = {"amdgpu-ieee"="true"}
attributes #1 = {"amdgpu-ieee"="false"}
-attributes #2 = {"no-nans-fp-math"="true"}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9e15225..3145a27 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -10,7 +10,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -453,7 +453,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -569,7 +569,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -740,7 +740,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -955,14 +955,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
%outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
%a = load float, ptr addrspace(1) %gep0
- %max = call float @llvm.maxnum.f32(float %a, float 2.0)
- %med = call float @llvm.minnum.f32(float %max, float 4.0)
+ %max = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call nnan float @llvm.minnum.f32(float %max, float 4.0)
store float %med, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1297,10 +1297,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1487,10 +1487,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%b.fneg = fsub float -0.0, %b
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b.fneg)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b.fneg)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1677,10 +1677,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%c.fneg = fsub float -0.0, %c
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1872,14 +1872,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %b.fabs = call float @llvm.fabs.f32(float %b)
- %c.fabs = call float @llvm.fabs.f32(float %c)
+ %b.fabs = call nnan float @llvm.fabs.f32(float %b)
+ %c.fabs = call nnan float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
@@ -2082,16 +2082,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
%c.fabs = call float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
- %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2418,7 +2418,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2570,7 +2570,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2878,10 +2878,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3220,10 +3220,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3372,10 +3372,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3524,10 +3524,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3676,10 +3676,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3828,10 +3828,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3980,10 +3980,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4132,10 +4132,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4284,10 +4284,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4436,10 +4436,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4588,10 +4588,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4740,10 +4740,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4892,10 +4892,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5044,10 +5044,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5196,10 +5196,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5348,10 +5348,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5503,10 +5503,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5515,7 +5515,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; Negative patterns
; ---------------------------------------------------------------------
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5717,7 +5717,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5944,7 +5944,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6146,7 +6146,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6352,7 +6352,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6527,7 +6527,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6702,7 +6702,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6877,7 +6877,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7270,10 +7270,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -7428,13 +7428,13 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %max = call float @llvm.maxnum.f32(float %a, float %b)
- %minmax = call float @llvm.minnum.f32(float %max, float %c)
+ %max = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %minmax = call nnan float @llvm.minnum.f32(float %max, float %c)
store float %minmax, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7597,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7865,7 +7865,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: two_non_inline_constant:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7998,7 +7998,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
}
; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
-define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: one_non_inline_constant:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8137,7 +8137,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: two_non_inline_constant_multi_use:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8343,7 +8343,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
ret void
}
-define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
; SI-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8384,7 +8384,7 @@ define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
ret float %med
}
-define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) #1 {
+define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8452,7 +8452,7 @@ define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %
ret <2 x float> %med
}
-define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) #1 {
+define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8525,7 +8525,7 @@ define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(
ret { float, float } %ins.1
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8567,7 +8567,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a)
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8609,7 +8609,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a)
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8651,7 +8651,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8693,7 +8693,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
ret float %med
}
-define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
+define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8772,7 +8772,7 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
ret half %med
}
-define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) #1 {
+define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8848,7 +8848,7 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
ret <2 x half> %med
}
-define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) #1 {
+define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) {
; SI-LABEL: v_test_fmed3_r_i_i_f64_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8905,5 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 6b09424..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -49,7 +49,6 @@ bb:
ret void
}
-; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec3..689d147 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1153 +6,1147 @@
define amdgpu_kernel void @largeInterleave() #0 { ret void }
; GCN-LABEL: largeInterleave:
; GCN: ; %bb.0:
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr0
- ; GCN-NEXT: ; implicit-def: $vgpr2
- ; GCN-NEXT: ; implicit-def: $vgpr1
- ; GCN-NEXT: ; implicit-def: $vgpr8
- ; GCN-NEXT: ; implicit-def: $vgpr94
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $vgpr106
- ; GCN-NEXT: ; implicit-def: $vgpr132
- ; GCN-NEXT: ; implicit-def: $vgpr133
- ; GCN-NEXT: ; implicit-def: $vgpr139
- ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
- ; GCN-NEXT: ; iglp_opt mask(0x00000002)
- ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN-NEXT: v_readfirstlane_b32 s17, v16
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $sgpr15
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2
- ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
- ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1
- ; GCN-NEXT: v_add_u32_e32 v93, s0, v92
- ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s18, s17, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1
+ ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25
+ ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6
+ ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1
+ ; GCN-NEXT: v_add_u32_e32 v17, s15, v226
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: s_lshl_b32 s0, s7, 7
- ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1
- ; GCN-NEXT: v_add_u32_e32 v8, 64, v93
- ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: v_add_u32_e32 v72, 64, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr213
+ ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
+ ; GCN-NEXT: ; implicit-def: $vgpr246
+ ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
+ ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
+ ; GCN-NEXT: ; implicit-def: $vgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr26
+ ; GCN-NEXT: ; implicit-def: $vgpr27
+ ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17
+ ; GCN-NEXT: v_add_u32_e32 v231, v19, v26
+ ; GCN-NEXT: v_add_u32_e32 v232, v19, v27
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; kill: killed $vgpr92
- ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: ; implicit-def: $vgpr28
+ ; GCN-NEXT: ; implicit-def: $vgpr29
+ ; GCN-NEXT: v_add_u32_e32 v233, v19, v28
+ ; GCN-NEXT: v_add_u32_e32 v234, v19, v29
+ ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: ; implicit-def: $sgpr7
+ ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
+ ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
+ ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: v_add_u32_e32 v18, s17, v20
+ ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18
+ ; GCN-NEXT: ; implicit-def: $sgpr16
+ ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_add_u32_e32 v247, v19, v24
+ ; GCN-NEXT: v_add_u32_e32 v248, v19, v16
+ ; GCN-NEXT: v_add_u32_e32 v249, v19, v18
+ ; GCN-NEXT: v_add_u32_e32 v250, v19, v20
+ ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
+ ; GCN-NEXT: ; implicit-def: $sgpr14
+ ; GCN-NEXT: ; implicit-def: $vgpr196
+ ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13
+ ; GCN-NEXT: ; implicit-def: $vgpr211
+ ; GCN-NEXT: v_max_f32_e32 v212, v211, v211
+ ; GCN-NEXT: ; implicit-def: $vgpr198
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr32
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: ; implicit-def: $vgpr34
+ ; GCN-NEXT: v_add_u32_e32 v210, v19, v34
+ ; GCN-NEXT: v_add_u32_e32 v206, v19, v33
+ ; GCN-NEXT: v_add_u32_e32 v205, v19, v32
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: ; implicit-def: $vgpr30
+ ; GCN-NEXT: ; implicit-def: $vgpr31
+ ; GCN-NEXT: v_add_u32_e32 v207, v19, v21
+ ; GCN-NEXT: v_add_u32_e32 v208, v19, v22
+ ; GCN-NEXT: v_add_u32_e32 v209, v19, v23
+ ; GCN-NEXT: v_add_u32_e32 v203, v19, v30
+ ; GCN-NEXT: v_add_u32_e32 v204, v19, v31
+ ; GCN-NEXT: ; kill: killed $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: ; implicit-def: $vgpr197
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN-NEXT: ds_write_b128 v230, v[64:67]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[172:175], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[188:191], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
- ; GCN-NEXT: ; implicit-def: $vgpr64
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93
- ; GCN-NEXT: ; implicit-def: $vgpr73
- ; GCN-NEXT: v_add_u32_e32 v76, v132, v64
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
+ ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v73
- ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr74
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v74
- ; GCN-NEXT: ; implicit-def: $vgpr75
- ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v75
- ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
+ ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7
+ ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5
+ ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr76
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $sgpr8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5
+ ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7
+ ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[64:67], v94
+ ; GCN-NEXT: ds_read_b128 v[156:159], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[94:97], v106
+ ; GCN-NEXT: ds_read_b128 v[230:233], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
- ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5
- ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8
- ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5
- ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
- ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8
- ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8
- ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5
- ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
- ; GCN-NEXT: s_nop 5
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48
- ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49
- ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55
- ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56
- ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: ; implicit-def: $sgpr6
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
- ; GCN-NEXT: s_nop 6
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
- ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
- ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20
- ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27
- ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $vgpr68
- ; GCN-NEXT: ; implicit-def: $vgpr67
- ; GCN-NEXT: v_add_u32_e32 v65, s7, v65
- ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65
- ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6
- ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
- ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v135, v[94:95]
- ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v65
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: ds_write_b64 v199, v[238:239]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[98:99]
+ ; GCN-NEXT: ds_write_b64 v200, v[240:241]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[102:103]
+ ; GCN-NEXT: ds_write_b64 v201, v[242:243]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[96:97]
- ; GCN-NEXT: v_add_u32_e32 v68, v132, v68
- ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7]
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v64
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: v_max_f32_e32 v66, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v134, v66, v64
- ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: ds_write_b64 v202, v[244:245]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v64
- ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
+ ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v66
- ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v67
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
- ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
- ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
- ; GCN-NEXT: v_exp_f32_e32 v163, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
- ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
- ; GCN-NEXT: v_exp_f32_e32 v164, v57
- ; GCN-NEXT: v_exp_f32_e32 v49, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
- ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
- ; GCN-NEXT: v_exp_f32_e32 v50, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
- ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134
- ; GCN-NEXT: v_exp_f32_e32 v51, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134
- ; GCN-NEXT: v_exp_f32_e32 v52, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134
- ; GCN-NEXT: v_exp_f32_e32 v53, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v54, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
- ; GCN-NEXT: v_exp_f32_e32 v55, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
- ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
- ; GCN-NEXT: v_exp_f32_e32 v56, v48
- ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
- ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
- ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v48, v48
- ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
- ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
- ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
- ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
- ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
- ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
- ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
- ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
- ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
- ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
- ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
- ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
- ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
- ; GCN-NEXT: v_exp_f32_e32 v58, v58
- ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
- ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
- ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
- ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
- ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
- ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
- ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
- ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
- ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
- ; GCN-NEXT: v_exp_f32_e32 v59, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
- ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
- ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
- ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
- ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
- ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
- ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
- ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
- ; GCN-NEXT: ; implicit-def: $vgpr57
- ; GCN-NEXT: ds_read_b128 v[60:63], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v160, v149
- ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
- ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
- ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
- ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
- ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
- ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
- ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
- ; GCN-NEXT: v_exp_f32_e32 v162, v146
- ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
- ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
- ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
- ; GCN-NEXT: v_exp_f32_e32 v151, v33
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
- ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
- ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134
- ; GCN-NEXT: v_exp_f32_e32 v153, v33
- ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
- ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
- ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161
- ; GCN-NEXT: v_exp_f32_e32 v165, v60
- ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8
- ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v161, v61
- ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8
- ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5
- ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8
- ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8
+ ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
+ ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5
+ ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7
+ ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7
+ ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5
+ ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7
+ ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5
+ ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113
+ ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
+ ; GCN-NEXT: s_nop 6
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81
+ ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65
+ ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_max_f32_e32 v129, v129, v129
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v129
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13]
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v128
+ ; GCN-NEXT: v_max_f32_e32 v128, v212, v128
+ ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128
+ ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128
+ ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128
+ ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v114, v138
+ ; GCN-NEXT: v_exp_f32_e32 v115, v139
+ ; GCN-NEXT: v_exp_f32_e32 v116, v140
+ ; GCN-NEXT: v_exp_f32_e32 v117, v141
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118
+ ; GCN-NEXT: v_exp_f32_e32 v118, v142
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_exp_f32_e32 v120, v144
+ ; GCN-NEXT: v_exp_f32_e32 v113, v112
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116
+ ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115
+ ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117
+ ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121
+ ; GCN-NEXT: v_exp_f32_e32 v112, v129
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v119, v143
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112
+ ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112
+ ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112
+ ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112
+ ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112
+ ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112
+ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119
+ ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120
+ ; GCN-NEXT: v_exp_f32_e32 v121, v148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v122, v149
+ ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125
+ ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128
+ ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v123, v150
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128
+ ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128
+ ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128
+ ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128
+ ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v124, v151
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122
+ ; GCN-NEXT: v_exp_f32_e32 v96, v129
+ ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v97, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124
+ ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v98, v138
+ ; GCN-NEXT: v_exp_f32_e32 v99, v127
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96
+ ; GCN-NEXT: v_exp_f32_e32 v100, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
- ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151
- ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153
- ; GCN-NEXT: v_exp_f32_e32 v159, v33
- ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38
- ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_exp_f32_e32 v152, v38
+ ; GCN-NEXT: ds_write_b64 v199, v[188:189]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[60:61]
+ ; GCN-NEXT: ds_write_b64 v200, v[190:191]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[32:33]
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: ; implicit-def: $vgpr38
+ ; GCN-NEXT: ds_write_b64 v201, v[192:193]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[140:141]
- ; GCN-NEXT: v_add_u32_e32 v38, v132, v38
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN-NEXT: ds_write_b64 v202, v[194:195]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v101, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr36
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v36
- ; GCN-NEXT: ; implicit-def: $vgpr37
- ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v102, v142
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v37
- ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165
- ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156
- ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134
- ; GCN-NEXT: ds_read_b128 v[36:39], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v154, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158
- ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134
- ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v155, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157
- ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161
- ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159
- ; GCN-NEXT: v_exp_f32_e32 v157, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142
- ; GCN-NEXT: v_exp_f32_e32 v146, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134
- ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
- ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v147, v36
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v143, v36
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
- ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157
- ; GCN-NEXT: v_exp_f32_e32 v156, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146
- ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32
- ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v129, v36
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147
- ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[36:39], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v142, v40
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143
- ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
- ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v63, v40
- ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61
- ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134
- ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156
- ; GCN-NEXT: v_exp_f32_e32 v158, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129
- ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v128, v17
- ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8
- ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62
- ; GCN-NEXT: v_exp_f32_e32 v167, v36
- ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8
- ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v130, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
- ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
- ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99
+ ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v103, v150
+ ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128
+ ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100
+ ; GCN-NEXT: v_exp_f32_e32 v104, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+ ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135
+ ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v105, v125
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102
+ ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7
+ ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5
+ ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v106, v156
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103
+ ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v107, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104
+ ; GCN-NEXT: v_exp_f32_e32 v108, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v109, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107
+ ; GCN-NEXT: v_exp_f32_e32 v110, v156
+ ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v111, v146
+ ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v80, v129
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v81, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144
+ ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110
+ ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v82, v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v83, v135
+ ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[20:21]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
- ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
- ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v140, v17
- ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[126:127]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
- ; GCN-NEXT: v_exp_f32_e32 v144, v22
+ ; GCN-NEXT: ds_write_b64 v200, v[150:151]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[16:17]
- ; GCN-NEXT: ; implicit-def: $vgpr17
- ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ds_write_b64 v201, v[152:153]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[42:43]
- ; GCN-NEXT: v_add_u32_e32 v22, v132, v22
- ; GCN-NEXT: v_add_u32_e32 v17, v132, v17
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v202, v[154:155]
+ ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v84, v129
+ ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v85, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v20, v132, v20
- ; GCN-NEXT: v_add_u32_e32 v21, v132, v21
- ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
- ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v132, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82
+ ; GCN-NEXT: v_exp_f32_e32 v86, v156
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167
- ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134
- ; GCN-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v87, v157
+ ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138
+ ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5
+ ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84
+ ; GCN-NEXT: v_exp_f32_e32 v88, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85
+ ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5
+ ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7
+ ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5
+ ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v89, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86
+ ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v90, v158
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87
+ ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v91, v139
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88
+ ; GCN-NEXT: v_exp_f32_e32 v92, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89
+ ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v93, v125
+ ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v94, v148
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v95, v127
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v125, v129
+ ; GCN-NEXT: ds_read_b128 v[132:135], v197
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v126, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94
+ ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95
+ ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v127, v143
+ ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v129, v138
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v62, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130
- ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134
- ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134
- ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134
- ; GCN-NEXT: ; implicit-def: $sgpr0
- ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140
- ; GCN-NEXT: v_exp_f32_e32 v145, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141
- ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46
- ; GCN-NEXT: v_exp_f32_e32 v35, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
- ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v46, v20
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v47, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34
- ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145
- ; GCN-NEXT: v_exp_f32_e32 v141, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35
- ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134
- ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16
- ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v33, v20
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[20:23], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v36, v24
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47
- ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
- ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v39, v24
- ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37
- ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141
- ; GCN-NEXT: v_exp_f32_e32 v148, v1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
- ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
- ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134
- ; GCN-NEXT: v_exp_f32_e32 v34, v1
- ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8
- ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38
- ; GCN-NEXT: v_exp_f32_e32 v150, v20
- ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v38, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39
- ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134
- ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5
- ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[4:5]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
- ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5
- ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v42, v1
- ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[150:151]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: ds_write_b64 v200, v[152:153]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125
+ ; GCN-NEXT: v_exp_f32_e32 v130, v158
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[0:1]
+ ; GCN-NEXT: ds_write_b64 v201, v[154:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[26:27]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ds_write_b64 v202, v[156:157]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
- ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
- ; GCN-NEXT: v_exp_f32_e32 v25, v6
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[4:7], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
- ; GCN-NEXT: v_exp_f32_e32 v26, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38
- ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v29, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41
- ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42
- ; GCN-NEXT: v_exp_f32_e32 v31, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
- ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
- ; GCN-NEXT: v_exp_f32_e32 v19, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
- ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v24, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
- ; GCN-NEXT: v_exp_f32_e32 v27, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
- ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
- ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
- ; GCN-NEXT: v_exp_f32_e32 v30, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
- ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v16, v4
- ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_exp_f32_e32 v18, v9
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_exp_f32_e32 v21, v9
- ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[4:7], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_exp_f32_e32 v2, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8
- ; GCN-NEXT: v_exp_f32_e32 v10, v1
- ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20
- ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0
- ; GCN-NEXT: v_add_f32_e32 v3, 0, v49
- ; GCN-NEXT: v_add_f32_e32 v3, v50, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v51, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v52, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v53, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v54, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v55, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v56, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v58, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v163, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v164, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v59, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v160, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v162, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v151, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v153, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v165, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v161, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v159, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v152, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v154, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v155, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v157, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v146, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v147, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v143, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v156, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v129, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v142, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v63, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v158, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v128, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v167, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v130, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v140, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v144, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v132, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v62, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v145, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v35, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v46, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v47, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v141, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v33, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v36, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v39, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v148, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v34, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v150, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
- ; GCN-NEXT: v_add_f32_e32 v3, v38, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v42, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v25, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v26, v3
- ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
- ; GCN-NEXT: v_add_f32_e32 v3, v29, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v31, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v19, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v24, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v27, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v30, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v16, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v18, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v21, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: v_add_f32_e32 v0, v2, v3
- ; GCN-NEXT: v_add_f32_e32 v4, v10, v0
- ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126
+ ; GCN-NEXT: v_exp_f32_e32 v131, v144
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127
+ ; GCN-NEXT: v_exp_f32_e32 v132, v145
+ ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v133, v141
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[68:71], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v72, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v73, v144
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132
+ ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v74, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
+ ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v75, v142
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v76, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73
+ ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v77, v147
+ ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68
+ ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v78, v67
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75
+ ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v79, v148
+ ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v142, v146
+ ; GCN-NEXT: ds_read_b128 v[68:71], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v137, v147
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v138, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: v_exp_f32_e32 v52, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79
+ ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50
+ ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136
+ ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, 0, v113
+ ; GCN-NEXT: v_add_f32_e32 v53, v114, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v115, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v116, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v117, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v118, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v119, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v120, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v121, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v122, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v123, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v124, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v96, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v97, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v98, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v99, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v100, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v101, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v102, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v103, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v104, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v105, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v106, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v107, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v108, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v109, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v110, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v111, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v80, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v81, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v82, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v83, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v84, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v85, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v86, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v87, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v88, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v89, v53
+ ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49
+ ; GCN-NEXT: v_add_f32_e32 v53, v90, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v91, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v53, v92, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v93, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v94, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v95, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v125, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v126, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v127, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v129, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_add_f32_e32 v0, v130, v53
+ ; GCN-NEXT: v_add_f32_e32 v0, v131, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v132, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v133, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v72, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v73, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v76, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v77, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v78, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v79, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v142, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v137, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v138, v0
+ ; GCN-NEXT: v_add_f32_e32 v4, v52, v0
+ ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
; GCN-NEXT: v_add_f32_e32 v2, v4, v5
- ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+ ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2
; GCN-NEXT: ; implicit-def: $vgpr4
- ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13]
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee..e174fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v3
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index aa099b6..b65a1a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[128:131], v1
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; GCN-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v3
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(14)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; GCN-NEXT: s_waitcnt lgkmcnt(8)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT: s_nop 12
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; GCN-NEXT: s_nop 11
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
@@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT: s_nop 12
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; EXACTCUTOFF-NEXT: s_nop 11
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 56f9c5d..d578d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -612,10 +612,10 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
- %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
- %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %x, float %y)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %x, float %y)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %z)
+ %tmp3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %tmp3, ptr addrspace(1) %arg
ret void
}
@@ -646,10 +646,10 @@ define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x
; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y)
- %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y)
- %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z)
- %tmp3 = call float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minimumnum.f32(float %x, float %y)
+ %tmp1 = call nnan float @llvm.maximumnum.f32(float %x, float %y)
+ %tmp2 = call nnan float @llvm.minimumnum.f32(float %tmp1, float %z)
+ %tmp3 = call nnan float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
store float %tmp3, ptr addrspace(1) %arg
ret void
}
@@ -1280,10 +1280,10 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
- %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
- %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
- %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+ %tmp0 = call nnan half @llvm.minnum.f16(half %x, half %y)
+ %tmp1 = call nnan half @llvm.maxnum.f16(half %x, half %y)
+ %tmp2 = call nnan half @llvm.minnum.f16(half %tmp1, half %z)
+ %tmp3 = call nnan half @llvm.maxnum.f16(half %tmp0, half %tmp2)
store half %tmp3, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
index ddbae64..a95d8c7 100644
--- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -1,8 +1,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
declare i64 @llvm.readsteadycounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 9a23788..8803f3a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -367,77 +367,76 @@ bb:
define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-LABEL: illegal_mfma_after_rewrite:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[16:19]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT: v_mov_b32_e32 v5, v4
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
-; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[4:7]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
-; CHECK-NEXT: global_store_short v[12:13], v17, off
+; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v9, v8
+; CHECK-NEXT: v_mov_b32_e32 v10, v8
+; CHECK-NEXT: v_mov_b32_e32 v11, v8
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v9, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
+; CHECK-NEXT: global_store_short v[0:1], v10, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
-; CHECK-NEXT: global_store_short v[12:13], v1, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CHECK-NEXT: global_store_short v[0:1], v6, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[12:13], v14, off
+; CHECK-NEXT: global_store_short v[0:1], v24, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v8, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
+; CHECK-NEXT: global_store_short v[0:1], v6, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: global_store_short v[12:13], v0, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad
define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 {
; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class:
; CHECK: ; %bb.0:
+; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v55, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v54, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v53, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v52, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v51, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v50, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v49, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v48, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v39, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v38, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v37, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v36, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v35, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v34, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v33, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
-; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000
-; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a32
-; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
-; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; CHECK-NEXT: s_nop 7
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
@@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35]
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37]
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39]
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43]
+; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45]
+; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47]
+; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49]
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51]
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53]
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61]
+; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
; CHECK-NEXT: s_endpgm
%src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
%mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
new file mode 100644
index 0000000..33b2f69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that cycle counts are consistent with hazards.
+
+# CHECK: Cycle: 3 TopQ.A
+# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c
+# CHECK-NOT: Cycle: 9 TopQ.A
+# CHECK: Cycle: 83 TopQ.A
+# CHECK: Checking pending node SU(6)
+# CHECK: Move SU(6) into Available Q
+
+---
+name: pending_queue_ready_cycle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %2:sgpr_128 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %18:areg_512 = IMPLICIT_DEF
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec
+ %7:vreg_512 = COPY %18
+ SCHED_BARRIER 0
+ S_NOP 0, implicit %18, implicit %7, implicit %84
+ S_ENDPGM 0
+...