aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll)6
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll)2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fp-reduce.ll178
-rw-r--r--llvm/test/CodeGen/AArch64/sve-int-reduce.ll125
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir1854
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll204
-rw-r--r--llvm/test/CodeGen/AMDGPU/readsteadycounter.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll239
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir32
-rw-r--r--llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll12
-rw-r--r--llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll12
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll20
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll (renamed from llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll)3
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll89
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll2
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-many-stores.mir88
-rw-r--r--llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll60
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-all-ones.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/idiv_large.ll2311
-rw-r--r--llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll208
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll104
-rw-r--r--llvm/test/CodeGen/X86/combine-udiv.ll18
-rw-r--r--llvm/test/CodeGen/X86/cpus-intel.ll2
-rw-r--r--llvm/test/CodeGen/X86/isel-fpclass.ll433
-rw-r--r--llvm/test/CodeGen/X86/madd.ll2
-rw-r--r--llvm/test/CodeGen/X86/min-legal-vector-width.ll57
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll1
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll1
-rw-r--r--llvm/test/CodeGen/X86/var-permute-128.ll55
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-reduce-add-mask.ll2
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-ashr-128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-lshr-128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll4
39 files changed, 4596 insertions, 1670 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
index a0f1b71..bb362d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines
-define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) {
+define void @zero_cycle_regmove_FPR64(double %a, double %b, double %c, double %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov d0, d2
@@ -45,7 +45,7 @@ entry:
declare float @foo_double(double, double)
-define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+define void @zero_cycle_regmove_FPR32(float %a, float %b, float %c, float %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
@@ -86,7 +86,7 @@ entry:
declare float @foo_float(float, float)
-define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+define void @zero_cycle_regmove_FPR16(half %a, half %b, half %c, half %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
index e14e69b..d6d3f15 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
-define void @zero_cycle_regmov_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
+define void @zero_cycle_regmove_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
; CHECK-LABEL: t:
; NOTCPU-LINUX: mov w0, w2
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 15ee6a0..36655f6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
ret float %r
}
+; No FMULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
+; CHECK-LABEL: fmulv_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
+; CHECK-LABEL: fmulv_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
+; CHECK-LABEL: fmulv_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
+ ret half %res
+}
+
+define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
+; CHECK-LABEL: fmulv_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
+ ret float %res
+}
+
+define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
+; CHECK-LABEL: fmulv_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
+ ret float %res
+}
+
+define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fmulv_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.d, #1.00000000
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
+ %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
+ ret double %res
+}
+
declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
-declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
-declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
-declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)
+
+declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
+declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
+declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
+declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index be936f0..6fb0315 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}
+; No MULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: mulv_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, #1 // =0x1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
+ ret i8 %res
+}
+
+define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: mulv_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, #1 // =0x1
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
+ ret i16 %res
+}
+
+define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: mulv_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
+ ret i32 %res
+}
+
+define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: mulv_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
+ ret i64 %res
+}
+
; Test widen vector reduce type
declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 6b09424..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -49,7 +49,6 @@ bb:
ret void
}
-; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec3..689d147 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1153 +6,1147 @@
define amdgpu_kernel void @largeInterleave() #0 { ret void }
; GCN-LABEL: largeInterleave:
; GCN: ; %bb.0:
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr0
- ; GCN-NEXT: ; implicit-def: $vgpr2
- ; GCN-NEXT: ; implicit-def: $vgpr1
- ; GCN-NEXT: ; implicit-def: $vgpr8
- ; GCN-NEXT: ; implicit-def: $vgpr94
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $vgpr106
- ; GCN-NEXT: ; implicit-def: $vgpr132
- ; GCN-NEXT: ; implicit-def: $vgpr133
- ; GCN-NEXT: ; implicit-def: $vgpr139
- ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
- ; GCN-NEXT: ; iglp_opt mask(0x00000002)
- ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN-NEXT: v_readfirstlane_b32 s17, v16
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $sgpr15
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2
- ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
- ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1
- ; GCN-NEXT: v_add_u32_e32 v93, s0, v92
- ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s18, s17, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1
+ ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25
+ ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6
+ ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1
+ ; GCN-NEXT: v_add_u32_e32 v17, s15, v226
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: s_lshl_b32 s0, s7, 7
- ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1
- ; GCN-NEXT: v_add_u32_e32 v8, 64, v93
- ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: v_add_u32_e32 v72, 64, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr213
+ ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
+ ; GCN-NEXT: ; implicit-def: $vgpr246
+ ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
+ ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
+ ; GCN-NEXT: ; implicit-def: $vgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr26
+ ; GCN-NEXT: ; implicit-def: $vgpr27
+ ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17
+ ; GCN-NEXT: v_add_u32_e32 v231, v19, v26
+ ; GCN-NEXT: v_add_u32_e32 v232, v19, v27
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; kill: killed $vgpr92
- ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: ; implicit-def: $vgpr28
+ ; GCN-NEXT: ; implicit-def: $vgpr29
+ ; GCN-NEXT: v_add_u32_e32 v233, v19, v28
+ ; GCN-NEXT: v_add_u32_e32 v234, v19, v29
+ ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: ; implicit-def: $sgpr7
+ ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
+ ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
+ ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: v_add_u32_e32 v18, s17, v20
+ ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18
+ ; GCN-NEXT: ; implicit-def: $sgpr16
+ ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_add_u32_e32 v247, v19, v24
+ ; GCN-NEXT: v_add_u32_e32 v248, v19, v16
+ ; GCN-NEXT: v_add_u32_e32 v249, v19, v18
+ ; GCN-NEXT: v_add_u32_e32 v250, v19, v20
+ ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
+ ; GCN-NEXT: ; implicit-def: $sgpr14
+ ; GCN-NEXT: ; implicit-def: $vgpr196
+ ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13
+ ; GCN-NEXT: ; implicit-def: $vgpr211
+ ; GCN-NEXT: v_max_f32_e32 v212, v211, v211
+ ; GCN-NEXT: ; implicit-def: $vgpr198
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr32
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: ; implicit-def: $vgpr34
+ ; GCN-NEXT: v_add_u32_e32 v210, v19, v34
+ ; GCN-NEXT: v_add_u32_e32 v206, v19, v33
+ ; GCN-NEXT: v_add_u32_e32 v205, v19, v32
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: ; implicit-def: $vgpr30
+ ; GCN-NEXT: ; implicit-def: $vgpr31
+ ; GCN-NEXT: v_add_u32_e32 v207, v19, v21
+ ; GCN-NEXT: v_add_u32_e32 v208, v19, v22
+ ; GCN-NEXT: v_add_u32_e32 v209, v19, v23
+ ; GCN-NEXT: v_add_u32_e32 v203, v19, v30
+ ; GCN-NEXT: v_add_u32_e32 v204, v19, v31
+ ; GCN-NEXT: ; kill: killed $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: ; implicit-def: $vgpr197
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN-NEXT: ds_write_b128 v230, v[64:67]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[172:175], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[188:191], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
- ; GCN-NEXT: ; implicit-def: $vgpr64
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93
- ; GCN-NEXT: ; implicit-def: $vgpr73
- ; GCN-NEXT: v_add_u32_e32 v76, v132, v64
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
+ ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v73
- ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr74
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v74
- ; GCN-NEXT: ; implicit-def: $vgpr75
- ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v75
- ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
+ ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7
+ ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5
+ ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr76
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $sgpr8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5
+ ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7
+ ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[64:67], v94
+ ; GCN-NEXT: ds_read_b128 v[156:159], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[94:97], v106
+ ; GCN-NEXT: ds_read_b128 v[230:233], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
- ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5
- ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8
- ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5
- ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
- ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8
- ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8
- ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5
- ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
- ; GCN-NEXT: s_nop 5
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48
- ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49
- ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55
- ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56
- ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: ; implicit-def: $sgpr6
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
- ; GCN-NEXT: s_nop 6
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
- ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
- ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20
- ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27
- ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $vgpr68
- ; GCN-NEXT: ; implicit-def: $vgpr67
- ; GCN-NEXT: v_add_u32_e32 v65, s7, v65
- ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65
- ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6
- ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
- ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v135, v[94:95]
- ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v65
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: ds_write_b64 v199, v[238:239]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[98:99]
+ ; GCN-NEXT: ds_write_b64 v200, v[240:241]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[102:103]
+ ; GCN-NEXT: ds_write_b64 v201, v[242:243]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[96:97]
- ; GCN-NEXT: v_add_u32_e32 v68, v132, v68
- ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7]
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v64
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: v_max_f32_e32 v66, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v134, v66, v64
- ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: ds_write_b64 v202, v[244:245]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v64
- ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
+ ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v66
- ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v67
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
- ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
- ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
- ; GCN-NEXT: v_exp_f32_e32 v163, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
- ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
- ; GCN-NEXT: v_exp_f32_e32 v164, v57
- ; GCN-NEXT: v_exp_f32_e32 v49, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
- ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
- ; GCN-NEXT: v_exp_f32_e32 v50, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
- ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134
- ; GCN-NEXT: v_exp_f32_e32 v51, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134
- ; GCN-NEXT: v_exp_f32_e32 v52, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134
- ; GCN-NEXT: v_exp_f32_e32 v53, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v54, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
- ; GCN-NEXT: v_exp_f32_e32 v55, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
- ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
- ; GCN-NEXT: v_exp_f32_e32 v56, v48
- ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
- ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
- ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v48, v48
- ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
- ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
- ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
- ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
- ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
- ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
- ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
- ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
- ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
- ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
- ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
- ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
- ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
- ; GCN-NEXT: v_exp_f32_e32 v58, v58
- ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
- ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
- ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
- ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
- ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
- ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
- ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
- ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
- ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
- ; GCN-NEXT: v_exp_f32_e32 v59, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
- ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
- ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
- ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
- ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
- ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
- ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
- ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
- ; GCN-NEXT: ; implicit-def: $vgpr57
- ; GCN-NEXT: ds_read_b128 v[60:63], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v160, v149
- ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
- ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
- ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
- ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
- ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
- ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
- ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
- ; GCN-NEXT: v_exp_f32_e32 v162, v146
- ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
- ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
- ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
- ; GCN-NEXT: v_exp_f32_e32 v151, v33
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
- ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
- ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134
- ; GCN-NEXT: v_exp_f32_e32 v153, v33
- ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
- ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
- ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161
- ; GCN-NEXT: v_exp_f32_e32 v165, v60
- ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8
- ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v161, v61
- ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8
- ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5
- ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8
- ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8
+ ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
+ ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5
+ ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7
+ ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7
+ ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5
+ ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7
+ ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5
+ ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113
+ ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
+ ; GCN-NEXT: s_nop 6
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81
+ ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65
+ ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_max_f32_e32 v129, v129, v129
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v129
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13]
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v128
+ ; GCN-NEXT: v_max_f32_e32 v128, v212, v128
+ ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128
+ ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128
+ ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128
+ ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v114, v138
+ ; GCN-NEXT: v_exp_f32_e32 v115, v139
+ ; GCN-NEXT: v_exp_f32_e32 v116, v140
+ ; GCN-NEXT: v_exp_f32_e32 v117, v141
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118
+ ; GCN-NEXT: v_exp_f32_e32 v118, v142
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_exp_f32_e32 v120, v144
+ ; GCN-NEXT: v_exp_f32_e32 v113, v112
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116
+ ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115
+ ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117
+ ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121
+ ; GCN-NEXT: v_exp_f32_e32 v112, v129
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v119, v143
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112
+ ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112
+ ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112
+ ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112
+ ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112
+ ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112
+ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119
+ ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120
+ ; GCN-NEXT: v_exp_f32_e32 v121, v148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v122, v149
+ ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125
+ ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128
+ ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v123, v150
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128
+ ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128
+ ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128
+ ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128
+ ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v124, v151
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122
+ ; GCN-NEXT: v_exp_f32_e32 v96, v129
+ ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v97, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124
+ ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v98, v138
+ ; GCN-NEXT: v_exp_f32_e32 v99, v127
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96
+ ; GCN-NEXT: v_exp_f32_e32 v100, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
- ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151
- ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153
- ; GCN-NEXT: v_exp_f32_e32 v159, v33
- ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38
- ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_exp_f32_e32 v152, v38
+ ; GCN-NEXT: ds_write_b64 v199, v[188:189]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[60:61]
+ ; GCN-NEXT: ds_write_b64 v200, v[190:191]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[32:33]
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: ; implicit-def: $vgpr38
+ ; GCN-NEXT: ds_write_b64 v201, v[192:193]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[140:141]
- ; GCN-NEXT: v_add_u32_e32 v38, v132, v38
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN-NEXT: ds_write_b64 v202, v[194:195]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v101, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr36
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v36
- ; GCN-NEXT: ; implicit-def: $vgpr37
- ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v102, v142
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v37
- ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165
- ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156
- ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134
- ; GCN-NEXT: ds_read_b128 v[36:39], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v154, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158
- ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134
- ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v155, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157
- ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161
- ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159
- ; GCN-NEXT: v_exp_f32_e32 v157, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142
- ; GCN-NEXT: v_exp_f32_e32 v146, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134
- ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
- ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v147, v36
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v143, v36
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
- ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157
- ; GCN-NEXT: v_exp_f32_e32 v156, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146
- ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32
- ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v129, v36
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147
- ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[36:39], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v142, v40
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143
- ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
- ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v63, v40
- ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61
- ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134
- ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156
- ; GCN-NEXT: v_exp_f32_e32 v158, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129
- ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v128, v17
- ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8
- ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62
- ; GCN-NEXT: v_exp_f32_e32 v167, v36
- ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8
- ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v130, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
- ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
- ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99
+ ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v103, v150
+ ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128
+ ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100
+ ; GCN-NEXT: v_exp_f32_e32 v104, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+ ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135
+ ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v105, v125
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102
+ ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7
+ ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5
+ ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v106, v156
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103
+ ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v107, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104
+ ; GCN-NEXT: v_exp_f32_e32 v108, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v109, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107
+ ; GCN-NEXT: v_exp_f32_e32 v110, v156
+ ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v111, v146
+ ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v80, v129
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v81, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144
+ ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110
+ ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v82, v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v83, v135
+ ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[20:21]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
- ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
- ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v140, v17
- ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[126:127]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
- ; GCN-NEXT: v_exp_f32_e32 v144, v22
+ ; GCN-NEXT: ds_write_b64 v200, v[150:151]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[16:17]
- ; GCN-NEXT: ; implicit-def: $vgpr17
- ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ds_write_b64 v201, v[152:153]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[42:43]
- ; GCN-NEXT: v_add_u32_e32 v22, v132, v22
- ; GCN-NEXT: v_add_u32_e32 v17, v132, v17
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v202, v[154:155]
+ ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v84, v129
+ ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v85, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v20, v132, v20
- ; GCN-NEXT: v_add_u32_e32 v21, v132, v21
- ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
- ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v132, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82
+ ; GCN-NEXT: v_exp_f32_e32 v86, v156
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167
- ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134
- ; GCN-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v87, v157
+ ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138
+ ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5
+ ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84
+ ; GCN-NEXT: v_exp_f32_e32 v88, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85
+ ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5
+ ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7
+ ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5
+ ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v89, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86
+ ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v90, v158
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87
+ ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v91, v139
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88
+ ; GCN-NEXT: v_exp_f32_e32 v92, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89
+ ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v93, v125
+ ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v94, v148
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v95, v127
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v125, v129
+ ; GCN-NEXT: ds_read_b128 v[132:135], v197
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v126, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94
+ ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95
+ ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v127, v143
+ ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v129, v138
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v62, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130
- ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134
- ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134
- ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134
- ; GCN-NEXT: ; implicit-def: $sgpr0
- ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140
- ; GCN-NEXT: v_exp_f32_e32 v145, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141
- ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46
- ; GCN-NEXT: v_exp_f32_e32 v35, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
- ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v46, v20
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v47, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34
- ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145
- ; GCN-NEXT: v_exp_f32_e32 v141, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35
- ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134
- ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16
- ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v33, v20
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[20:23], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v36, v24
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47
- ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
- ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v39, v24
- ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37
- ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141
- ; GCN-NEXT: v_exp_f32_e32 v148, v1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
- ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
- ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134
- ; GCN-NEXT: v_exp_f32_e32 v34, v1
- ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8
- ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38
- ; GCN-NEXT: v_exp_f32_e32 v150, v20
- ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v38, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39
- ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134
- ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5
- ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[4:5]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
- ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5
- ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v42, v1
- ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[150:151]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: ds_write_b64 v200, v[152:153]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125
+ ; GCN-NEXT: v_exp_f32_e32 v130, v158
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[0:1]
+ ; GCN-NEXT: ds_write_b64 v201, v[154:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[26:27]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ds_write_b64 v202, v[156:157]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
- ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
- ; GCN-NEXT: v_exp_f32_e32 v25, v6
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[4:7], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
- ; GCN-NEXT: v_exp_f32_e32 v26, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38
- ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v29, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41
- ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42
- ; GCN-NEXT: v_exp_f32_e32 v31, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
- ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
- ; GCN-NEXT: v_exp_f32_e32 v19, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
- ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v24, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
- ; GCN-NEXT: v_exp_f32_e32 v27, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
- ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
- ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
- ; GCN-NEXT: v_exp_f32_e32 v30, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
- ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v16, v4
- ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_exp_f32_e32 v18, v9
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_exp_f32_e32 v21, v9
- ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[4:7], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_exp_f32_e32 v2, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8
- ; GCN-NEXT: v_exp_f32_e32 v10, v1
- ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20
- ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0
- ; GCN-NEXT: v_add_f32_e32 v3, 0, v49
- ; GCN-NEXT: v_add_f32_e32 v3, v50, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v51, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v52, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v53, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v54, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v55, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v56, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v58, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v163, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v164, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v59, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v160, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v162, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v151, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v153, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v165, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v161, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v159, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v152, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v154, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v155, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v157, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v146, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v147, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v143, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v156, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v129, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v142, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v63, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v158, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v128, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v167, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v130, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v140, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v144, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v132, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v62, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v145, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v35, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v46, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v47, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v141, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v33, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v36, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v39, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v148, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v34, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v150, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
- ; GCN-NEXT: v_add_f32_e32 v3, v38, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v42, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v25, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v26, v3
- ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
- ; GCN-NEXT: v_add_f32_e32 v3, v29, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v31, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v19, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v24, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v27, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v30, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v16, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v18, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v21, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: v_add_f32_e32 v0, v2, v3
- ; GCN-NEXT: v_add_f32_e32 v4, v10, v0
- ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126
+ ; GCN-NEXT: v_exp_f32_e32 v131, v144
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127
+ ; GCN-NEXT: v_exp_f32_e32 v132, v145
+ ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v133, v141
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[68:71], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v72, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v73, v144
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132
+ ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v74, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
+ ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v75, v142
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v76, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73
+ ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v77, v147
+ ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68
+ ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v78, v67
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75
+ ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v79, v148
+ ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v142, v146
+ ; GCN-NEXT: ds_read_b128 v[68:71], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v137, v147
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v138, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: v_exp_f32_e32 v52, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79
+ ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50
+ ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136
+ ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, 0, v113
+ ; GCN-NEXT: v_add_f32_e32 v53, v114, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v115, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v116, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v117, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v118, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v119, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v120, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v121, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v122, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v123, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v124, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v96, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v97, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v98, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v99, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v100, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v101, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v102, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v103, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v104, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v105, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v106, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v107, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v108, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v109, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v110, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v111, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v80, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v81, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v82, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v83, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v84, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v85, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v86, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v87, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v88, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v89, v53
+ ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49
+ ; GCN-NEXT: v_add_f32_e32 v53, v90, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v91, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v53, v92, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v93, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v94, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v95, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v125, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v126, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v127, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v129, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_add_f32_e32 v0, v130, v53
+ ; GCN-NEXT: v_add_f32_e32 v0, v131, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v132, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v133, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v72, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v73, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v76, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v77, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v78, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v79, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v142, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v137, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v138, v0
+ ; GCN-NEXT: v_add_f32_e32 v4, v52, v0
+ ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
; GCN-NEXT: v_add_f32_e32 v2, v4, v5
- ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+ ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2
; GCN-NEXT: ; implicit-def: $vgpr4
- ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13]
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee..e174fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v3
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index aa099b6..b65a1a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[128:131], v1
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; GCN-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v3
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(14)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; GCN-NEXT: s_waitcnt lgkmcnt(8)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT: s_nop 12
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; GCN-NEXT: s_nop 11
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
@@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT: s_nop 12
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; EXACTCUTOFF-NEXT: s_nop 11
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
index ddbae64..a95d8c7 100644
--- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -1,8 +1,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
declare i64 @llvm.readsteadycounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 9a23788..8803f3a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -367,77 +367,76 @@ bb:
define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-LABEL: illegal_mfma_after_rewrite:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[16:19]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT: v_mov_b32_e32 v5, v4
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
-; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[4:7]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
-; CHECK-NEXT: global_store_short v[12:13], v17, off
+; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v9, v8
+; CHECK-NEXT: v_mov_b32_e32 v10, v8
+; CHECK-NEXT: v_mov_b32_e32 v11, v8
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v9, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
+; CHECK-NEXT: global_store_short v[0:1], v10, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
-; CHECK-NEXT: global_store_short v[12:13], v1, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CHECK-NEXT: global_store_short v[0:1], v6, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[12:13], v14, off
+; CHECK-NEXT: global_store_short v[0:1], v24, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v8, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
+; CHECK-NEXT: global_store_short v[0:1], v6, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: global_store_short v[12:13], v0, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad
define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 {
; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class:
; CHECK: ; %bb.0:
+; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v55, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v54, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v53, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v52, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v51, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v50, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v49, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v48, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v39, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v38, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v37, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v36, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v35, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v34, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v33, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
-; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000
-; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a32
-; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
-; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; CHECK-NEXT: s_nop 7
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
@@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35]
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37]
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39]
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43]
+; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45]
+; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47]
+; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49]
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51]
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53]
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61]
+; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
; CHECK-NEXT: s_endpgm
%src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
%mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
new file mode 100644
index 0000000..33b2f69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that cycle counts are consistent with hazards.
+
+# CHECK: Cycle: 3 TopQ.A
+# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c
+# CHECK-NOT: Cycle: 9 TopQ.A
+# CHECK: Cycle: 83 TopQ.A
+# CHECK: Checking pending node SU(6)
+# CHECK: Move SU(6) into Available Q
+
+---
+name: pending_queue_ready_cycle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %2:sgpr_128 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %18:areg_512 = IMPLICIT_DEF
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec
+ %7:vreg_512 = COPY %18
+ SCHED_BARRIER 0
+ S_NOP 0, implicit %18, implicit %7, implicit %84
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
index 71dcf11..196560f 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
@@ -11,11 +11,11 @@ declare void @f16_user(half)
; CHECK-SAME: in function four64
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @four64() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {double}, 8, 0))
+ %buffer = call target("dx.CBuffer", <{ double }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {double, double, double, double} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {double}, 8, 0)) %buffer,
+ target("dx.CBuffer", <{ double }>) %buffer,
i32 0)
%data = extractvalue {double, double, double, double} %load, 0
@@ -28,11 +28,11 @@ define void @four64() "hlsl.export" {
; CHECK-SAME: in function two32
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @two32() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {float, float} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float} %load, 0
@@ -41,5 +41,5 @@ define void @two32() "hlsl.export" {
ret void
}
-declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_tdx.Layout_sl_f64s_8_0tt(target("dx.CBuffer", target("dx.Layout", { double }, 8, 0)), i32)
-declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_tdx.Layout_sl_f32s_4_0tt(target("dx.CBuffer", target("dx.Layout", { float }, 4, 0)), i32)
+declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_sl_f64st(target("dx.CBuffer", <{ double }>), i32)
+declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_sl_f32st(target("dx.CBuffer", <{ float }>), i32)
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
index d690651..dd40aa8 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
@@ -8,12 +8,12 @@ declare void @f16_user(half)
; CHECK-LABEL: define void @loadf32
define void @loadf32() {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {float, float, float, float} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float, float, float} %load, 0
@@ -27,12 +27,12 @@ define void @loadf32() {
; CHECK-LABEL: define void @loadf64
define void @loadf64() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24))
+ target("dx.CBuffer", <{ <4 x double> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %{{.*}}, i32 1)
%load = call {double, double} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24)) %buffer,
+ target("dx.CBuffer", <{ <4 x double> }>) %buffer,
i32 1)
%data = extractvalue {double, double} %load, 1
@@ -46,12 +46,12 @@ define void @loadf64() {
; CHECK-LABEL: define void @loadf16
define void @loadf16() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0))
+ target("dx.CBuffer", <{ half }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {half, half, half, half, half, half, half, half} @llvm.dx.resource.load.cbufferrow.8(
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0)) %buffer,
+ target("dx.CBuffer", <{ half }>) %buffer,
i32 0)
%data = extractvalue {half, half, half, half, half, half, half, half} %load, 0
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
index bcf82a6..5cd67be 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
@@ -18,7 +18,7 @@ define void @main() #0 {
%srv0 = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(
i32 1, i32 8, i32 1, i32 0, ptr null)
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
index 70224fc..d792078 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
@@ -14,7 +14,7 @@ define void @main() #0 {
; CHECK: Kind: CBuffer
; CHECK: Flags:
; CHECK: UsedByAtomic64: false
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
; ByteAddressBuffer Buf : register(t8, space1)
diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
index 38f2de2..671fcef 100644
--- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
+++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
@@ -72,7 +72,7 @@ define void @test_bindings() {
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) #[[#ATTR]]
; cbuffer cb0 : register(b0) { int4 i; float4 f; }
- %cb0 = call target("dx.CBuffer", target("dx.Layout", {<4 x i32>, <4 x float>}, 32, 0, 16))
+ %cb0 = call target("dx.CBuffer", <{ <4 x i32>, <4 x float> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[BUF6:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false) #[[#ATTR]]
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF6]], %dx.types.ResourceProperties { i32 13, i32 32 }) #[[#ATTR]]
diff --git a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
index 26b157f..d674863 100644
--- a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
+++ b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
@@ -4,27 +4,27 @@
%__cblayout_CB2 = type <{ float }>
%struct.Scalars = type { float, i32, i32 }
-@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) poison
-@CB2.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) poison
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+@CB2.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB2) poison
define void @main() local_unnamed_addr #1 {
entry:
; CHECK: [[CB:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefrombinding
- %h = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %h, ptr @CB.cb, align 4
+ %h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB) %h, ptr @CB.cb, align 4
%_ZL3Out_h.i.i = tail call target("dx.RawBuffer", %struct.Scalars, 1, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK-NOT: load target({{.*}}), ptr @CB.cb
- %cb = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)), ptr @CB.cb, align 4
+ %cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
; CHECK: call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target({{.*}}) [[CB]], i32 0)
- %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %cb, i32 0)
+ %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", %__cblayout_CB) %cb, i32 0)
%1 = extractvalue { float, float, float, float } %0, 0
call void @llvm.dx.resource.store.rawbuffer(target("dx.RawBuffer", %struct.Scalars, 1, 0) %_ZL3Out_h.i.i, i32 0, i32 0, float %1)
-
+
; CHECK: [[CB2:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefromimplicitbinding
- %h2 = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) %h2, ptr @CB2.cb, align 4
+ %h2 = tail call target("dx.CBuffer", %__cblayout_CB2) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB2) %h2, ptr @CB2.cb, align 4
; CHECK-NOT: load target({{.*}}), ptr @CB2.cb
- %cb2 = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)), ptr @CB2.cb, align 4
+ %cb2 = load target("dx.CBuffer", %__cblayout_CB2), ptr @CB2.cb, align 4
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
index f1d28e2..85952c9 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
@@ -1,3 +1,6 @@
+; TODO: Remove this test once we've updated the frontend to use explicit
+; padding. The cbuffer-metadata.ll test covers the newer logic.
+
; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
new file mode 100644
index 0000000..6b90e17
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
+; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+%__cblayout_CB1 = type <{ float, i32, double, <2 x i32> }>
+@CB1.cb = global target("dx.CBuffer", %__cblayout_CB1) poison
+@CB1.str = private unnamed_addr constant [4 x i8] c"CB1\00", align 1
+
+%__cblayout_CB2 = type <{ float, target("dx.Padding", 4), double, float, half, i16, i64, i32 }>
+@CB2.cb = global target("dx.CBuffer", %__cblayout_CB2) poison
+@CB2.str = private unnamed_addr constant [4 x i8] c"CB2\00", align 1
+
+%__cblayout_MyConstants = type <{
+ double, target("dx.Padding", 8),
+ <3 x float>, float,
+ <3 x double>, half, target("dx.Padding", 6),
+ <2 x double>,
+ float, <3 x half>, <3 x half>
+}>
+@MyConstants.cb = global target("dx.CBuffer", %__cblayout_MyConstants) poison
+@MyConstants.str = private unnamed_addr constant [12 x i8] c"MyConstants\00", align 1
+
+; PRINT:; Resource Bindings:
+; PRINT-NEXT:;
+; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count
+; PRINT-NEXT:; ----
+; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1
+; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1
+; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1
+
+define void @test() #0 {
+
+ ; cbuffer CB1 : register(b0) {
+ ; float a;
+ ; int b;
+ ; double c;
+ ; int2 d;
+ ; }
+ %CB1.cb_h = call target("dx.CBuffer", %__cblayout_CB1)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr @CB1.str)
+
+ ; cbuffer CB2 : register(b0) {
+ ; float a;
+ ; double b;
+ ; float c;
+ ; half d;
+ ; uint16_t e;
+ ; int64_t f;
+ ; int g;
+ ;}
+ %CB2.cb_h = call target("dx.CBuffer", %__cblayout_CB2)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, ptr @CB2.str)
+
+ ; cbuffer CB3 : register(b5) {
+ ; double B0;
+ ; float3 B1;
+ ; float B2;
+ ; double3 B3;
+ ; half B4;
+ ; double2 B5;
+ ; float B6;
+ ; half3 B7;
+ ; half3 B8;
+ ; }
+ %CB3.cb_h = call target("dx.CBuffer", %__cblayout_MyConstants)
+ @llvm.dx.resource.handlefrombinding(i32 15, i32 5, i32 1, i32 0, ptr @MyConstants.str)
+
+ ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.shader"="compute" }
+
+; CHECK: %CBuffer.CB1 = type { { float, i32, double, <2 x i32> } }
+; CHECK: %CBuffer.CB2 = type { { float, double, float, half, i16, i64, i32 } }
+; CHECK: %CBuffer.MyConstants = type { { double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> } }
+
+; CHECK: @CB1 = external constant %CBuffer.CB1
+; CHECK: @CB2 = external constant %CBuffer.CB2
+; CHECK: @MyConstants = external constant %CBuffer.MyConstants
+
+; CHECK: !dx.resources = !{[[ResList:[!][0-9]+]]}
+
+; CHECK: [[ResList]] = !{null, null, [[CBList:[!][0-9]+]], null}
+; CHECK: [[CBList]] = !{![[CB1:[0-9]+]], ![[CB2:[0-9]+]], ![[MYCONSTANTS:[0-9]+]]}
+; CHECK: ![[CB1]] = !{i32 0, ptr @CB1, !"CB1", i32 0, i32 0, i32 1, i32 24, null}
+; CHECK: ![[CB2]] = !{i32 1, ptr @CB2, !"CB2", i32 0, i32 1, i32 1, i32 36, null}
+; CHECK: ![[MYCONSTANTS]] = !{i32 2, ptr @MyConstants, !"MyConstants", i32 15, i32 5, i32 1, i32 96, null}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
index e2a1c09..0b454c1 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
@@ -7,7 +7,7 @@
target triple = "dxil-pc-shadermodel6.6-compute"
define void @cbuffer_is_only_binding() {
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr null)
; CHECK: %CBuffer = type { float }
diff --git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
new file mode 100644
index 0000000..bf14dcf
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
@@ -0,0 +1,88 @@
+# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# This loop has six stores, which exceeds the limit set by
+# `pipeliner-max-num-stores`.
+
+# CHECK: Too many stores
+
+--- |
+ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+ target triple = "hexagon-unknown-linux-musl"
+
+ define void @f(ptr %a, i32 %n) #0 {
+ entry:
+ %guard = icmp sgt i32 %n, 0
+ %btc = sub nsw i32 %n, 1
+ br i1 %guard, label %loop.preheader, label %exit
+
+ loop.preheader: ; preds = %entry
+ %0 = add i32 %n, 1
+ %cgep = getelementptr i8, ptr %a, i32 %0
+ br label %loop
+
+ loop: ; preds = %loop.preheader, %loop
+ %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ]
+ %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ]
+ %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2
+ store i8 0, ptr %cgep7, align 1
+ %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1
+ store i8 1, ptr %cgep8, align 1
+ store i8 2, ptr %lsr.iv, align 1
+ %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1
+ store i8 3, ptr %cgep9, align 1
+ %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2
+ store i8 4, ptr %cgep10, align 1
+ %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3
+ store i8 5, ptr %cgep11, align 1
+ %i.dec = sub i32 %i, 1
+ %ec = icmp eq i32 %i.dec, 0
+ br i1 %ec, label %exit, label %loop
+
+ exit: ; preds = %loop, %entry
+ ret void
+ }
+
+ attributes #0 = { "target-cpu"="hexagonv79" }
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.3(0x30000000)
+ liveins: $r0, $r1
+
+ %7:intregs = COPY $r1
+ %6:intregs = COPY $r0
+ %8:predregs = C2_cmpgti %7, 0
+ J2_jumpf %8, %bb.3, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1.loop.preheader:
+ successors: %bb.2(0x80000000)
+
+ %0:intregs = A2_addi %7, -1
+ %1:intregs = S4_addaddi %7, %6, 1
+ %10:intregs = A2_tfrsi 0
+ %11:intregs = A2_tfrsi 1
+ %14:intregs = COPY %0
+ J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2.loop (machine-block-address-taken):
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %2:intregs = PHI %1, %bb.1, %4, %bb.2
+ S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7)
+ %4:intregs = A2_addi %2, -1
+ S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8)
+ S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv)
+ S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9)
+ S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10)
+ S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11)
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3.exit:
+ PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
new file mode 100644
index 0000000..e67d031
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
+
+; Function for the vector type v2i64 `a + {1, 1}`
+define <2 x i64> @test_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_v2i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltisw v3, 1
+; CHECK-NEXT: vupklsw v3, v3
+; CHECK-NEXT: vaddudm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <2 x i64> %a, splat (i64 1)
+ ret <2 x i64> %add
+}
+
+; Function for the vector type v4i32 `a + {1, 1, 1, 1}`
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_v4i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltisw v3, 1
+; CHECK-NEXT: vadduwm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <4 x i32> %a, splat (i32 1)
+ ret <4 x i32> %add
+}
+
+; Function for the vector type v8i16 `a + {1, 1, 1, 1, 1, 1, 1, 1}`
+define <8 x i16> @test_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_v8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltish v3, 1
+; CHECK-NEXT: vadduhm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <8 x i16> %a, splat (i16 1)
+ ret <8 x i16> %add
+}
+
+; Function for the vector type v16i8 `a + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}`
+define <16 x i8> @test_16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxspltib v3, 1
+; CHECK-NEXT: vaddubm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <16 x i8> %a, splat (i8 1)
+ ret <16 x i8> %add
+}
diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
deleted file mode 100644
index e4c93adc..0000000
--- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation.
-; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s
-; followed by subtraction operation.
-define dso_local noundef <4 x i32> @test1(<4 x i32> %a) {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vspltisw v3, 1
-; CHECK-NEXT: vadduwm v2, v2, v3
-; CHECK-NEXT: blr
-entry:
- %add = add <4 x i32> %a, splat (i32 1)
- ret <4 x i32> %add
-}
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index 9937627..d7b00f6 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -1,16 +1,2315 @@
-; RUN: llc -mtriple=riscv32 < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 < %s | FileCheck %s --check-prefix=RV64
+
+define i64 @udiv_i64(i64 %x, i64 %y) nounwind {
+; RV32-LABEL: udiv_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call __udivdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: tail __udivdi3
+ %res = udiv i64 %x, %y
+ ret i64 %res
+}
+
+define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
+; RV32-LABEL: udiv_i65:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw t1, 8(a2)
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: lui a5, 209715
+; RV32-NEXT: lui a6, 61681
+; RV32-NEXT: addi t0, a2, 1365
+; RV32-NEXT: addi a7, a5, 819
+; RV32-NEXT: addi a6, a6, -241
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli t3, a4, 31
+; RV32-NEXT: or t2, a5, a2
+; RV32-NEXT: srli a2, a3, 1
+; RV32-NEXT: or t4, a2, t3
+; RV32-NEXT: bnez t2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a2, t4, 1
+; RV32-NEXT: or a2, t4, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli a2, a2, 24
+; RV32-NEXT: addi t3, a2, 32
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: srli a2, t2, 1
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli t3, a2, 24
+; RV32-NEXT: .LBB1_3: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -96
+; RV32-NEXT: sw s0, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a2, a3, 31
+; RV32-NEXT: li t5, 64
+; RV32-NEXT: bnez a2, .LBB1_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li s0, 64
+; RV32-NEXT: j .LBB1_6
+; RV32-NEXT: .LBB1_5:
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli s0, a2, 24
+; RV32-NEXT: .LBB1_6: # %_udiv-special-cases
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw s2, 8(a1)
+; RV32-NEXT: or a1, t4, t2
+; RV32-NEXT: addi s1, s0, 64
+; RV32-NEXT: bnez a1, .LBB1_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t3, s1
+; RV32-NEXT: .LBB1_8: # %_udiv-special-cases
+; RV32-NEXT: snez s4, a1
+; RV32-NEXT: srli a1, a2, 1
+; RV32-NEXT: slli t2, s2, 31
+; RV32-NEXT: slli t4, a2, 31
+; RV32-NEXT: or a1, t2, a1
+; RV32-NEXT: srli t2, a5, 1
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: bnez a1, .LBB1_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli t2, t6, 1
+; RV32-NEXT: or t2, t6, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli t2, t2, 24
+; RV32-NEXT: addi s3, t2, 32
+; RV32-NEXT: j .LBB1_11
+; RV32-NEXT: .LBB1_10:
+; RV32-NEXT: srli t2, a1, 1
+; RV32-NEXT: or t2, a1, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli s3, t2, 24
+; RV32-NEXT: .LBB1_11: # %_udiv-special-cases
+; RV32-NEXT: andi t4, s2, 1
+; RV32-NEXT: andi t1, t1, 1
+; RV32-NEXT: or t2, a3, a4
+; RV32-NEXT: or s2, a5, a2
+; RV32-NEXT: sltu s0, s1, s0
+; RV32-NEXT: slli s1, a5, 31
+; RV32-NEXT: addi s4, s4, -1
+; RV32-NEXT: beqz s1, .LBB1_13
+; RV32-NEXT: # %bb.12:
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: or t5, s1, t5
+; RV32-NEXT: srli s1, t5, 2
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 4
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 8
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 16
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: not t5, t5
+; RV32-NEXT: srli s1, t5, 1
+; RV32-NEXT: and t0, s1, t0
+; RV32-NEXT: sub t0, t5, t0
+; RV32-NEXT: and t5, t0, a7
+; RV32-NEXT: srli t0, t0, 2
+; RV32-NEXT: and a7, t0, a7
+; RV32-NEXT: add a7, t5, a7
+; RV32-NEXT: srli t0, a7, 4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: and a6, a7, a6
+; RV32-NEXT: slli a7, a6, 8
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: slli a7, a6, 16
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: srli t5, a6, 24
+; RV32-NEXT: .LBB1_13: # %_udiv-special-cases
+; RV32-NEXT: or t0, t2, t1
+; RV32-NEXT: or a6, s2, t4
+; RV32-NEXT: and a7, s4, s0
+; RV32-NEXT: or t6, t6, a1
+; RV32-NEXT: addi s0, t5, 64
+; RV32-NEXT: bnez t6, .LBB1_15
+; RV32-NEXT: # %bb.14: # %_udiv-special-cases
+; RV32-NEXT: mv s3, s0
+; RV32-NEXT: .LBB1_15: # %_udiv-special-cases
+; RV32-NEXT: seqz a1, t0
+; RV32-NEXT: sltu t0, s0, t5
+; RV32-NEXT: snez t5, t6
+; RV32-NEXT: addi t5, t5, -1
+; RV32-NEXT: and t0, t5, t0
+; RV32-NEXT: sltu t5, t3, s3
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: mv t6, t5
+; RV32-NEXT: beq a7, t0, .LBB1_17
+; RV32-NEXT: # %bb.16: # %_udiv-special-cases
+; RV32-NEXT: sltu t6, a7, t0
+; RV32-NEXT: .LBB1_17: # %_udiv-special-cases
+; RV32-NEXT: or a1, a1, a6
+; RV32-NEXT: andi a6, t6, 1
+; RV32-NEXT: sub a7, a7, t0
+; RV32-NEXT: sub t5, a7, t5
+; RV32-NEXT: sub a7, t3, s3
+; RV32-NEXT: beqz a6, .LBB1_19
+; RV32-NEXT: # %bb.18: # %_udiv-special-cases
+; RV32-NEXT: mv t0, a6
+; RV32-NEXT: j .LBB1_20
+; RV32-NEXT: .LBB1_19:
+; RV32-NEXT: sltiu t0, a7, 65
+; RV32-NEXT: xori t0, t0, 1
+; RV32-NEXT: snez t3, t5
+; RV32-NEXT: or t0, t0, t3
+; RV32-NEXT: .LBB1_20: # %_udiv-special-cases
+; RV32-NEXT: or t6, a1, t0
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and t3, t4, a1
+; RV32-NEXT: and t0, a1, a2
+; RV32-NEXT: and a1, a1, a5
+; RV32-NEXT: bnez t6, .LBB1_30
+; RV32-NEXT: # %bb.21: # %_udiv-special-cases
+; RV32-NEXT: xori t6, a7, 64
+; RV32-NEXT: or t6, t6, a6
+; RV32-NEXT: or t6, t6, t5
+; RV32-NEXT: beqz t6, .LBB1_30
+; RV32-NEXT: # %bb.22: # %udiv-bb1
+; RV32-NEXT: addi a1, a7, 1
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw zero, 36(sp)
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw zero, 44(sp)
+; RV32-NEXT: sw a5, 48(sp)
+; RV32-NEXT: sw a2, 52(sp)
+; RV32-NEXT: sw t4, 56(sp)
+; RV32-NEXT: li t0, 64
+; RV32-NEXT: addi t3, sp, 48
+; RV32-NEXT: neg s1, a7
+; RV32-NEXT: seqz t6, a1
+; RV32-NEXT: sub a7, t0, a7
+; RV32-NEXT: add t5, t5, t6
+; RV32-NEXT: andi t0, a7, 31
+; RV32-NEXT: srli a7, a7, 3
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: xori s2, t0, 31
+; RV32-NEXT: andi a7, a7, 12
+; RV32-NEXT: seqz t0, t6
+; RV32-NEXT: sub s3, t3, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: lw t3, 0(s3)
+; RV32-NEXT: lw s4, 4(s3)
+; RV32-NEXT: andi a7, a6, 1
+; RV32-NEXT: or t6, t6, a7
+; RV32-NEXT: srli a6, t3, 1
+; RV32-NEXT: sll t0, s4, s1
+; RV32-NEXT: srl a6, a6, s2
+; RV32-NEXT: or t0, t0, a6
+; RV32-NEXT: sll a6, t3, s1
+; RV32-NEXT: li t3, 0
+; RV32-NEXT: beqz t6, .LBB1_28
+; RV32-NEXT: # %bb.23: # %udiv-preheader
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: srli s4, s4, 1
+; RV32-NEXT: lw s3, 8(s3)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw zero, 20(sp)
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: sw zero, 28(sp)
+; RV32-NEXT: sw a5, 0(sp)
+; RV32-NEXT: sw a2, 4(sp)
+; RV32-NEXT: sw t4, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: srli a2, a1, 3
+; RV32-NEXT: srl a5, s4, s2
+; RV32-NEXT: mv t4, sp
+; RV32-NEXT: snez t2, t2
+; RV32-NEXT: andi a2, a2, 12
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: lw t2, 0(a2)
+; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw a2, 8(a2)
+; RV32-NEXT: sll s1, s3, s1
+; RV32-NEXT: andi s2, a1, 31
+; RV32-NEXT: xori s2, s2, 31
+; RV32-NEXT: or s3, s1, a5
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: slli a5, t4, 1
+; RV32-NEXT: sll a2, a2, s2
+; RV32-NEXT: sll s2, a5, s2
+; RV32-NEXT: srl s1, t4, a1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: seqz a2, a3
+; RV32-NEXT: sub a2, a4, a2
+; RV32-NEXT: addi a5, t1, 1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: andi s3, s3, 1
+; RV32-NEXT: srl t1, t2, a1
+; RV32-NEXT: or s2, t1, s2
+; RV32-NEXT: addi t1, a3, -1
+; RV32-NEXT: j .LBB1_26
+; RV32-NEXT: .LBB1_24: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, a2, s4
+; RV32-NEXT: .LBB1_25: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: srli s1, s1, 31
+; RV32-NEXT: sub t4, a5, s1
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: slli t2, t2, 31
+; RV32-NEXT: srai s1, t2, 31
+; RV32-NEXT: and s3, s1, a4
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: srli s5, a6, 31
+; RV32-NEXT: sub s4, s4, s3
+; RV32-NEXT: slli s3, t0, 1
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli t0, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a6, t3, a6
+; RV32-NEXT: seqz t3, a1
+; RV32-NEXT: or s0, s0, t0
+; RV32-NEXT: or s5, a1, t5
+; RV32-NEXT: sub t5, t5, t3
+; RV32-NEXT: and s6, s1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi t3, s1, 1
+; RV32-NEXT: or t0, t6, s3
+; RV32-NEXT: sltu t6, s2, s6
+; RV32-NEXT: snez s5, s5
+; RV32-NEXT: andi s3, s0, 1
+; RV32-NEXT: sub s1, s4, t6
+; RV32-NEXT: add a7, a7, s5
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: andi a7, a7, 1
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: or s4, t6, a7
+; RV32-NEXT: sub s2, s2, s6
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: beqz s4, .LBB1_29
+; RV32-NEXT: .LBB1_26: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli t2, s2, 31
+; RV32-NEXT: slli t4, s1, 1
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or s4, t4, t2
+; RV32-NEXT: andi t2, s3, 1
+; RV32-NEXT: or s2, s2, t2
+; RV32-NEXT: bne a2, s4, .LBB1_24
+; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, t1, s2
+; RV32-NEXT: j .LBB1_25
+; RV32-NEXT: .LBB1_28:
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: .LBB1_29: # %udiv-loop-exit
+; RV32-NEXT: srli a2, a6, 31
+; RV32-NEXT: slli a3, t0, 1
+; RV32-NEXT: srli a4, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a1, t3, a6
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: or a4, t4, a4
+; RV32-NEXT: or t0, a2, a3
+; RV32-NEXT: andi t3, a4, 1
+; RV32-NEXT: .LBB1_30: # %udiv-end
+; RV32-NEXT: andi a2, t3, 1
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: sw t0, 4(a0)
+; RV32-NEXT: sb a2, 8(a0)
+; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 96
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i65:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: andi a1, a1, 1
+; RV64-NEXT: andi a3, a3, 1
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %res = udiv i65 %x, %y
+ ret i65 %res
+}
define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: udiv_i128:
-; CHECK: call __udivti3
+; RV32-LABEL: udiv_i128:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -160
+; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: lw s8, 0(a2)
+; RV32-NEXT: lw s9, 4(a2)
+; RV32-NEXT: lw s11, 8(a2)
+; RV32-NEXT: lw ra, 12(a2)
+; RV32-NEXT: lui t4, 349525
+; RV32-NEXT: addi t4, t4, 1365
+; RV32-NEXT: lui t3, 209715
+; RV32-NEXT: addi t3, t3, 819
+; RV32-NEXT: lui t2, 61681
+; RV32-NEXT: addi t2, t2, -241
+; RV32-NEXT: bnez s9, .LBB2_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s8, 1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t6, a0, 32
+; RV32-NEXT: j .LBB2_3
+; RV32-NEXT: .LBB2_2:
+; RV32-NEXT: srli a0, s9, 1
+; RV32-NEXT: or a0, s9, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t6, a0, 24
+; RV32-NEXT: .LBB2_3: # %_udiv-special-cases
+; RV32-NEXT: lw a6, 4(a1)
+; RV32-NEXT: or s0, s11, ra
+; RV32-NEXT: bnez ra, .LBB2_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s11, 1
+; RV32-NEXT: or a0, s11, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t5, a0, 32
+; RV32-NEXT: j .LBB2_6
+; RV32-NEXT: .LBB2_5:
+; RV32-NEXT: srli a0, ra, 1
+; RV32-NEXT: or a0, ra, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t5, a0, 24
+; RV32-NEXT: .LBB2_6: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 12(a1)
+; RV32-NEXT: addi a0, t6, 64
+; RV32-NEXT: bnez s0, .LBB2_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t5, a0
+; RV32-NEXT: .LBB2_8: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 0(a1)
+; RV32-NEXT: lw t0, 8(a1)
+; RV32-NEXT: snez s3, s0
+; RV32-NEXT: bnez a6, .LBB2_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli a1, t1, 1
+; RV32-NEXT: or a1, t1, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: addi a3, a1, 32
+; RV32-NEXT: j .LBB2_11
+; RV32-NEXT: .LBB2_10:
+; RV32-NEXT: srli a1, a6, 1
+; RV32-NEXT: or a1, a6, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a3, a1, 24
+; RV32-NEXT: .LBB2_11: # %_udiv-special-cases
+; RV32-NEXT: or a1, s9, ra
+; RV32-NEXT: or s0, s8, s11
+; RV32-NEXT: or s1, a6, a7
+; RV32-NEXT: or s2, t1, t0
+; RV32-NEXT: sltu t6, a0, t6
+; RV32-NEXT: addi s3, s3, -1
+; RV32-NEXT: addi a0, a3, 64
+; RV32-NEXT: or s4, t0, a7
+; RV32-NEXT: sltu s5, a0, a3
+; RV32-NEXT: snez s6, s4
+; RV32-NEXT: addi s6, s6, -1
+; RV32-NEXT: bnez a7, .LBB2_13
+; RV32-NEXT: # %bb.12: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t0, 1
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a3, a3, 32
+; RV32-NEXT: j .LBB2_14
+; RV32-NEXT: .LBB2_13:
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a3, a7, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: .LBB2_14: # %_udiv-special-cases
+; RV32-NEXT: or s0, s0, a1
+; RV32-NEXT: or a5, s2, s1
+; RV32-NEXT: and a1, s3, t6
+; RV32-NEXT: and a4, s6, s5
+; RV32-NEXT: bnez s4, .LBB2_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv a3, a0
+; RV32-NEXT: .LBB2_16: # %_udiv-special-cases
+; RV32-NEXT: seqz a0, s0
+; RV32-NEXT: seqz a5, a5
+; RV32-NEXT: sltu t2, t5, a3
+; RV32-NEXT: sub t4, a1, a4
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: beq a1, a4, .LBB2_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: sltu t3, a1, a4
+; RV32-NEXT: .LBB2_18: # %_udiv-special-cases
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: or a0, a0, a5
+; RV32-NEXT: neg t4, t3
+; RV32-NEXT: seqz t6, t3
+; RV32-NEXT: addi t6, t6, -1
+; RV32-NEXT: or a1, t4, t6
+; RV32-NEXT: sub t3, t5, a3
+; RV32-NEXT: beqz a1, .LBB2_20
+; RV32-NEXT: # %bb.19: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: j .LBB2_21
+; RV32-NEXT: .LBB2_20:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a3, t3, 128
+; RV32-NEXT: xori a3, a3, 1
+; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: .LBB2_21: # %_udiv-special-cases
+; RV32-NEXT: or a5, a0, a1
+; RV32-NEXT: addi a3, a5, -1
+; RV32-NEXT: and a0, a3, a7
+; RV32-NEXT: and a1, a3, t0
+; RV32-NEXT: and a4, a3, a6
+; RV32-NEXT: and a3, a3, t1
+; RV32-NEXT: bnez a5, .LBB2_26
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: xori a5, t3, 127
+; RV32-NEXT: or a5, a5, t4
+; RV32-NEXT: or t5, t2, t6
+; RV32-NEXT: or a5, a5, t5
+; RV32-NEXT: beqz a5, .LBB2_26
+; RV32-NEXT: # %bb.23: # %udiv-bb1
+; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, t3, 1
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw t1, 88(sp)
+; RV32-NEXT: sw a6, 92(sp)
+; RV32-NEXT: sw t0, 96(sp)
+; RV32-NEXT: sw a7, 100(sp)
+; RV32-NEXT: li a0, 127
+; RV32-NEXT: addi a2, sp, 88
+; RV32-NEXT: seqz a3, a1
+; RV32-NEXT: sub a0, a0, t3
+; RV32-NEXT: add t2, t2, a3
+; RV32-NEXT: andi a3, a0, 31
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: or a4, a1, t2
+; RV32-NEXT: xori a3, a3, 31
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: seqz t5, a4
+; RV32-NEXT: sub a2, a2, a0
+; RV32-NEXT: add t5, t4, t5
+; RV32-NEXT: lw a0, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw a5, 8(a2)
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: sltu t4, t5, t4
+; RV32-NEXT: or s0, a1, t5
+; RV32-NEXT: add t4, t6, t4
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: or s0, s0, t6
+; RV32-NEXT: srli t6, a5, 1
+; RV32-NEXT: srli s1, a4, 1
+; RV32-NEXT: srli s2, a0, 1
+; RV32-NEXT: srl t6, t6, a3
+; RV32-NEXT: srl s1, s1, a3
+; RV32-NEXT: srl a3, s2, a3
+; RV32-NEXT: not t3, t3
+; RV32-NEXT: sll a2, a2, t3
+; RV32-NEXT: or s2, a2, t6
+; RV32-NEXT: sll a2, a5, t3
+; RV32-NEXT: sll a4, a4, t3
+; RV32-NEXT: or s1, a2, s1
+; RV32-NEXT: or t6, a4, a3
+; RV32-NEXT: sll t3, a0, t3
+; RV32-NEXT: bnez s0, .LBB2_27
+; RV32-NEXT: # %bb.24:
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB2_25: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a0, s2, a0
+; RV32-NEXT: srli a1, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or a1, s1, a1
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli a3, t3, 1
+; RV32-NEXT: or a3, s0, a3
+; RV32-NEXT: or a2, s6, a2
+; RV32-NEXT: or a4, a2, t6
+; RV32-NEXT: or a1, s7, a1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB2_26: # %udiv-end
+; RV32-NEXT: sw a3, 0(s7)
+; RV32-NEXT: sw a4, 4(s7)
+; RV32-NEXT: sw a1, 8(s7)
+; RV32-NEXT: sw a0, 12(s7)
+; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 160
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB2_27: # %udiv-preheader
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw zero, 60(sp)
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw zero, 68(sp)
+; RV32-NEXT: sw t1, 40(sp)
+; RV32-NEXT: sw a6, 44(sp)
+; RV32-NEXT: sw t0, 48(sp)
+; RV32-NEXT: sw a7, 52(sp)
+; RV32-NEXT: srli a0, a1, 3
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: andi a5, a1, 31
+; RV32-NEXT: xori a5, a5, 31
+; RV32-NEXT: slli a6, a4, 1
+; RV32-NEXT: slli a7, a3, 1
+; RV32-NEXT: slli t0, a2, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a7, a7, a5
+; RV32-NEXT: sll a5, t0, a5
+; RV32-NEXT: seqz t0, s8
+; RV32-NEXT: srl a3, a3, a1
+; RV32-NEXT: or s10, a3, a6
+; RV32-NEXT: or a3, s8, s9
+; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a6, s9, t0
+; RV32-NEXT: seqz a3, a3
+; RV32-NEXT: srl a2, a2, a1
+; RV32-NEXT: or s9, a2, a7
+; RV32-NEXT: sub a7, s11, a3
+; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a2, s11, a3
+; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a2, ra, a2
+; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: srl ra, a4, a1
+; RV32-NEXT: or t1, a0, a5
+; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s8, s8, -1
+; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: j .LBB2_29
+; RV32-NEXT: .LBB2_28: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: sub a0, a0, a5
+; RV32-NEXT: srli a5, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a5, s2, a5
+; RV32-NEXT: srli s2, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, s2
+; RV32-NEXT: srli s2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: or t6, t6, s2
+; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s2, s10, a2
+; RV32-NEXT: or t3, s0, t3
+; RV32-NEXT: sub a2, a3, s2
+; RV32-NEXT: sltu a3, a3, s2
+; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, s10, t0
+; RV32-NEXT: sub t0, s9, s0
+; RV32-NEXT: or s2, a1, t2
+; RV32-NEXT: sub s9, a0, a4
+; RV32-NEXT: seqz a0, a1
+; RV32-NEXT: sub t2, t2, a0
+; RV32-NEXT: or t6, s5, t6
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi s0, s10, 1
+; RV32-NEXT: seqz a0, s2
+; RV32-NEXT: or s1, s3, s1
+; RV32-NEXT: or s2, s4, a5
+; RV32-NEXT: sub s10, a2, ra
+; RV32-NEXT: sltu a2, a2, ra
+; RV32-NEXT: sub a3, t0, a3
+; RV32-NEXT: sltu a4, t5, a0
+; RV32-NEXT: sub t5, t5, a0
+; RV32-NEXT: sub ra, a3, a2
+; RV32-NEXT: sub t4, t4, a4
+; RV32-NEXT: or a0, t2, t4
+; RV32-NEXT: or a2, a1, t5
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sub t1, s11, t1
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: beqz a0, .LBB2_25
+; RV32-NEXT: .LBB2_29: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a0, t1, 31
+; RV32-NEXT: slli a3, s9, 1
+; RV32-NEXT: slli t1, t1, 1
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: srli a3, s2, 31
+; RV32-NEXT: or s11, t1, a3
+; RV32-NEXT: beq a6, a0, .LBB2_31
+; RV32-NEXT: # %bb.30: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu a4, a6, a0
+; RV32-NEXT: j .LBB2_32
+; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a4, a2, s11
+; RV32-NEXT: .LBB2_32: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, s10, 31
+; RV32-NEXT: slli ra, ra, 1
+; RV32-NEXT: srli a5, s9, 31
+; RV32-NEXT: slli s10, s10, 1
+; RV32-NEXT: or s9, ra, a3
+; RV32-NEXT: or a3, s10, a5
+; RV32-NEXT: sub a5, a7, a3
+; RV32-NEXT: sltu t1, a7, a3
+; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, t0, s9
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: sub a5, s6, t1
+; RV32-NEXT: sub a5, a5, a4
+; RV32-NEXT: srai s10, a5, 31
+; RV32-NEXT: and t1, s10, a2
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a5, s10, a2
+; RV32-NEXT: sltu a4, s11, t1
+; RV32-NEXT: mv ra, a4
+; RV32-NEXT: beq a0, a5, .LBB2_28
+; RV32-NEXT: # %bb.33: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu ra, a0, a5
+; RV32-NEXT: j .LBB2_28
+;
+; RV64-LABEL: udiv_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
%res = udiv i128 %x, %y
ret i128 %res
}
define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
-; CHECK-LABEL: udiv_i129:
-; CHECK-NOT: call{{.*}}div
+; RV32-LABEL: udiv_i129:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -240
+; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: lw t2, 16(a2)
+; RV32-NEXT: lw a4, 0(a2)
+; RV32-NEXT: lw a5, 4(a2)
+; RV32-NEXT: lw a6, 8(a2)
+; RV32-NEXT: lw a0, 12(a2)
+; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi t5, a0, 1365
+; RV32-NEXT: addi t4, a2, 819
+; RV32-NEXT: addi t3, a3, -241
+; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, a5, 1
+; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a3, a5, 31
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: bnez a0, .LBB3_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a6, a3, 32
+; RV32-NEXT: j .LBB3_3
+; RV32-NEXT: .LBB3_2:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a6, a3, 24
+; RV32-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: slli a5, t2, 31
+; RV32-NEXT: slli a7, a7, 31
+; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli t0, a4, 1
+; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: slli a4, a4, 31
+; RV32-NEXT: li s2, 64
+; RV32-NEXT: bnez a4, .LBB3_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li t6, 64
+; RV32-NEXT: j .LBB3_6
+; RV32-NEXT: .LBB3_5:
+; RV32-NEXT: srli t1, a4, 1
+; RV32-NEXT: or t1, a4, t1
+; RV32-NEXT: srli t6, t1, 2
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 8
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 16
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: not t1, t1
+; RV32-NEXT: srli t6, t1, 1
+; RV32-NEXT: and t6, t6, t5
+; RV32-NEXT: sub t1, t1, t6
+; RV32-NEXT: and t6, t1, t4
+; RV32-NEXT: srli t1, t1, 2
+; RV32-NEXT: and t1, t1, t4
+; RV32-NEXT: add t1, t6, t1
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: and t1, t1, t3
+; RV32-NEXT: slli t6, t1, 8
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: slli t6, t1, 16
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: srli t6, t1, 24
+; RV32-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV32-NEXT: or t1, a5, a3
+; RV32-NEXT: or a7, t0, a7
+; RV32-NEXT: bnez a4, .LBB3_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: li t6, 128
+; RV32-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV32-NEXT: or a5, a7, t1
+; RV32-NEXT: addi a4, a6, 64
+; RV32-NEXT: addi a3, t6, 128
+; RV32-NEXT: or a0, a0, t1
+; RV32-NEXT: or a2, a2, a7
+; RV32-NEXT: or s3, a2, a0
+; RV32-NEXT: sltu s0, a3, t6
+; RV32-NEXT: bnez s3, .LBB3_11
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: mv t6, s0
+; RV32-NEXT: beqz t1, .LBB3_12
+; RV32-NEXT: .LBB3_10:
+; RV32-NEXT: srli a0, t1, 1
+; RV32-NEXT: or a0, t1, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli s1, a0, 24
+; RV32-NEXT: beqz a5, .LBB3_13
+; RV32-NEXT: j .LBB3_14
+; RV32-NEXT: .LBB3_11:
+; RV32-NEXT: snez a0, a5
+; RV32-NEXT: sltu a2, a4, a6
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and t6, a0, a2
+; RV32-NEXT: bnez t1, .LBB3_10
+; RV32-NEXT: .LBB3_12: # %_udiv-special-cases
+; RV32-NEXT: srli a0, a7, 1
+; RV32-NEXT: or a0, a7, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi s1, a0, 32
+; RV32-NEXT: bnez a5, .LBB3_14
+; RV32-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a4
+; RV32-NEXT: .LBB3_14: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: bnez s3, .LBB3_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a3
+; RV32-NEXT: .LBB3_16: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 12(a1)
+; RV32-NEXT: lw a1, 16(a1)
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, t0, 1
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: slli a2, t0, 31
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: bnez a0, .LBB3_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi s5, a3, 32
+; RV32-NEXT: j .LBB3_19
+; RV32-NEXT: .LBB3_18:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli s5, a3, 24
+; RV32-NEXT: .LBB3_19: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t1, 1
+; RV32-NEXT: slli a4, a1, 31
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli s4, a7, 31
+; RV32-NEXT: srli s6, a6, 1
+; RV32-NEXT: beqz s4, .LBB3_21
+; RV32-NEXT: # %bb.20:
+; RV32-NEXT: srli s2, s4, 1
+; RV32-NEXT: or s2, s4, s2
+; RV32-NEXT: srli s7, s2, 2
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 8
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 16
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: not s2, s2
+; RV32-NEXT: srli s7, s2, 1
+; RV32-NEXT: and s7, s7, t5
+; RV32-NEXT: sub s2, s2, s7
+; RV32-NEXT: and s7, s2, t4
+; RV32-NEXT: srli s2, s2, 2
+; RV32-NEXT: and s2, s2, t4
+; RV32-NEXT: add s2, s7, s2
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: and s2, s2, t3
+; RV32-NEXT: slli s7, s2, 8
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: slli s7, s2, 16
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: srli s2, s2, 24
+; RV32-NEXT: .LBB3_21: # %_udiv-special-cases
+; RV32-NEXT: or s7, a4, a3
+; RV32-NEXT: or s6, s6, a5
+; RV32-NEXT: bnez s4, .LBB3_23
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: li s2, 128
+; RV32-NEXT: .LBB3_23: # %_udiv-special-cases
+; RV32-NEXT: or s4, s6, s7
+; RV32-NEXT: addi a5, s5, 64
+; RV32-NEXT: addi a3, s2, 128
+; RV32-NEXT: or a0, a0, s7
+; RV32-NEXT: or a4, a2, s6
+; RV32-NEXT: or a4, a4, a0
+; RV32-NEXT: sltu a0, a3, s2
+; RV32-NEXT: bnez a4, .LBB3_26
+; RV32-NEXT: # %bb.24: # %_udiv-special-cases
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: beqz s7, .LBB3_27
+; RV32-NEXT: .LBB3_25:
+; RV32-NEXT: srli s3, s7, 1
+; RV32-NEXT: or s3, s7, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: j .LBB3_28
+; RV32-NEXT: .LBB3_26:
+; RV32-NEXT: snez a2, s4
+; RV32-NEXT: sltu s2, a5, s5
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a2, a2, s2
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: bnez s7, .LBB3_25
+; RV32-NEXT: .LBB3_27: # %_udiv-special-cases
+; RV32-NEXT: srli s3, s6, 1
+; RV32-NEXT: or s3, s6, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: addi t3, t3, 32
+; RV32-NEXT: .LBB3_28: # %_udiv-special-cases
+; RV32-NEXT: xori t4, s0, 1
+; RV32-NEXT: addi s2, s2, -1
+; RV32-NEXT: bnez s4, .LBB3_30
+; RV32-NEXT: # %bb.29: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a5
+; RV32-NEXT: .LBB3_30: # %_udiv-special-cases
+; RV32-NEXT: andi s11, a1, 1
+; RV32-NEXT: andi s8, t2, 1
+; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s9, a1, a5
+; RV32-NEXT: or t2, a7, a6
+; RV32-NEXT: neg a1, t4
+; RV32-NEXT: and s0, s2, s0
+; RV32-NEXT: bnez a4, .LBB3_32
+; RV32-NEXT: # %bb.31: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a3
+; RV32-NEXT: .LBB3_32: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s10, a3, a5
+; RV32-NEXT: or a5, s9, s8
+; RV32-NEXT: or t4, t0, t1
+; RV32-NEXT: or t5, t2, s11
+; RV32-NEXT: and a1, s0, a1
+; RV32-NEXT: xori a3, a0, 1
+; RV32-NEXT: snez a4, a4
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: addi a4, a4, -1
+; RV32-NEXT: and a0, a4, a0
+; RV32-NEXT: sltu a4, s1, t3
+; RV32-NEXT: and t2, a0, a3
+; RV32-NEXT: mv a3, a4
+; RV32-NEXT: beq t6, a2, .LBB3_34
+; RV32-NEXT: # %bb.33: # %_udiv-special-cases
+; RV32-NEXT: sltu a3, t6, a2
+; RV32-NEXT: .LBB3_34: # %_udiv-special-cases
+; RV32-NEXT: or a0, a5, s10
+; RV32-NEXT: or t5, t5, t4
+; RV32-NEXT: sltu t4, a1, t2
+; RV32-NEXT: mv s0, a3
+; RV32-NEXT: beq a1, t2, .LBB3_36
+; RV32-NEXT: # %bb.35: # %_udiv-special-cases
+; RV32-NEXT: mv s0, t4
+; RV32-NEXT: .LBB3_36: # %_udiv-special-cases
+; RV32-NEXT: seqz a5, a0
+; RV32-NEXT: seqz t5, t5
+; RV32-NEXT: andi a0, s0, 1
+; RV32-NEXT: sub a2, t6, a2
+; RV32-NEXT: sub a1, a1, t2
+; RV32-NEXT: sub t2, a2, a4
+; RV32-NEXT: sltu a2, a1, a3
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: neg t4, a2
+; RV32-NEXT: sub a4, a1, a3
+; RV32-NEXT: or a1, a4, t4
+; RV32-NEXT: sub a3, s1, t3
+; RV32-NEXT: beqz a1, .LBB3_38
+; RV32-NEXT: # %bb.37: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: bnez a0, .LBB3_39
+; RV32-NEXT: j .LBB3_40
+; RV32-NEXT: .LBB3_38:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a2, a3, 129
+; RV32-NEXT: xori a2, a2, 1
+; RV32-NEXT: or a1, a2, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: beqz a0, .LBB3_40
+; RV32-NEXT: .LBB3_39: # %_udiv-special-cases
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: .LBB3_40: # %_udiv-special-cases
+; RV32-NEXT: or t6, a2, a1
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and a2, s11, a1
+; RV32-NEXT: and a5, a1, t1
+; RV32-NEXT: and t3, a1, a6
+; RV32-NEXT: and t5, a1, t0
+; RV32-NEXT: and a1, a1, a7
+; RV32-NEXT: bnez t6, .LBB3_57
+; RV32-NEXT: # %bb.41: # %_udiv-special-cases
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: xori s0, a3, 128
+; RV32-NEXT: or s0, s0, a0
+; RV32-NEXT: or s0, s0, a4
+; RV32-NEXT: or t6, s0, t6
+; RV32-NEXT: beqz t6, .LBB3_57
+; RV32-NEXT: # %bb.42: # %udiv-bb1
+; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, a3, 1
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw zero, 140(sp)
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw zero, 148(sp)
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw zero, 124(sp)
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw zero, 132(sp)
+; RV32-NEXT: sw a7, 152(sp)
+; RV32-NEXT: sw t0, 156(sp)
+; RV32-NEXT: sw a6, 160(sp)
+; RV32-NEXT: sw t1, 164(sp)
+; RV32-NEXT: sw s11, 168(sp)
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: addi t3, sp, 152
+; RV32-NEXT: neg a2, a3
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: sub a5, a5, a3
+; RV32-NEXT: add t2, t2, t5
+; RV32-NEXT: andi a3, a5, 31
+; RV32-NEXT: srli t5, a5, 3
+; RV32-NEXT: or t6, a1, t2
+; RV32-NEXT: xori a5, a3, 31
+; RV32-NEXT: andi a3, t5, 28
+; RV32-NEXT: seqz t6, t6
+; RV32-NEXT: sub ra, t3, a3
+; RV32-NEXT: add t6, a4, t6
+; RV32-NEXT: lw t3, 0(ra)
+; RV32-NEXT: lw s0, 4(ra)
+; RV32-NEXT: lw s1, 8(ra)
+; RV32-NEXT: lw a3, 12(ra)
+; RV32-NEXT: sltu a4, t6, a4
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: add t4, t4, a4
+; RV32-NEXT: or a4, t2, t4
+; RV32-NEXT: or a4, t5, a4
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: seqz s2, a4
+; RV32-NEXT: add a0, a0, s2
+; RV32-NEXT: sll s2, a3, a2
+; RV32-NEXT: srl t5, t5, a5
+; RV32-NEXT: or t5, s2, t5
+; RV32-NEXT: srli s2, s0, 1
+; RV32-NEXT: sll s1, s1, a2
+; RV32-NEXT: srl s2, s2, a5
+; RV32-NEXT: or s2, s1, s2
+; RV32-NEXT: srli s1, t3, 1
+; RV32-NEXT: sll s0, s0, a2
+; RV32-NEXT: srl s1, s1, a5
+; RV32-NEXT: andi s3, a0, 1
+; RV32-NEXT: or s1, s0, s1
+; RV32-NEXT: or a0, a4, s3
+; RV32-NEXT: sll t3, t3, a2
+; RV32-NEXT: beqz a0, .LBB3_55
+; RV32-NEXT: # %bb.43: # %udiv-preheader
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: lw a0, 16(ra)
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw zero, 108(sp)
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw zero, 116(sp)
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw zero, 92(sp)
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw zero, 100(sp)
+; RV32-NEXT: sw s11, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw a7, 56(sp)
+; RV32-NEXT: sw t0, 60(sp)
+; RV32-NEXT: sw a6, 64(sp)
+; RV32-NEXT: sw t1, 68(sp)
+; RV32-NEXT: srli a4, a1, 3
+; RV32-NEXT: addi a6, sp, 56
+; RV32-NEXT: andi a7, a1, 31
+; RV32-NEXT: or t0, s9, s10
+; RV32-NEXT: srl a3, a3, a5
+; RV32-NEXT: andi a4, a4, 28
+; RV32-NEXT: xori a5, a7, 31
+; RV32-NEXT: snez a7, t0
+; RV32-NEXT: add a4, a6, a4
+; RV32-NEXT: add a7, s8, a7
+; RV32-NEXT: lw a6, 16(a4)
+; RV32-NEXT: lw t0, 0(a4)
+; RV32-NEXT: lw t1, 4(a4)
+; RV32-NEXT: lw s0, 8(a4)
+; RV32-NEXT: lw a4, 12(a4)
+; RV32-NEXT: sll a0, a0, a2
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: slli a0, a4, 1
+; RV32-NEXT: slli a2, s0, 1
+; RV32-NEXT: slli s4, t1, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a0, a0, a5
+; RV32-NEXT: sll s8, a2, a5
+; RV32-NEXT: sll s4, s4, a5
+; RV32-NEXT: srl a2, a4, a1
+; RV32-NEXT: or ra, a2, a6
+; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: seqz a4, a6
+; RV32-NEXT: srl a2, s0, a1
+; RV32-NEXT: or a2, a2, a0
+; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a0, a6, a5
+; RV32-NEXT: sub s5, a5, a4
+; RV32-NEXT: seqz a4, a0
+; RV32-NEXT: srl a0, t1, a1
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub t1, a5, a4
+; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, a5, a4
+; RV32-NEXT: andi a4, a7, 1
+; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: andi a5, a3, 1
+; RV32-NEXT: srl a3, t0, a1
+; RV32-NEXT: or a4, a3, s4
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: j .LBB3_45
+; RV32-NEXT: .LBB3_44: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, a5, s0
+; RV32-NEXT: xor s8, t1, a7
+; RV32-NEXT: xor s9, a2, s0
+; RV32-NEXT: or s8, s9, s8
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: sltu s4, a2, s0
+; RV32-NEXT: sub s0, a2, s0
+; RV32-NEXT: sub a7, t1, a7
+; RV32-NEXT: srli a2, s2, 31
+; RV32-NEXT: sub a0, a0, t0
+; RV32-NEXT: slli t0, t5, 1
+; RV32-NEXT: or t0, t0, a2
+; RV32-NEXT: srli a2, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or t1, s2, a2
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t3, a2, t3
+; RV32-NEXT: srli a2, t5, 31
+; RV32-NEXT: or s7, s7, a2
+; RV32-NEXT: sub a2, s0, ra
+; RV32-NEXT: sltu s0, s0, ra
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: sub a7, a7, s4
+; RV32-NEXT: or s2, t2, t4
+; RV32-NEXT: sub a0, a0, a6
+; RV32-NEXT: or a6, a1, t2
+; RV32-NEXT: or s4, t5, s2
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: sub t2, t2, t5
+; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s1, a5, s1
+; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s2, a5, t1
+; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t5, a5, t0
+; RV32-NEXT: andi a5, s7, 1
+; RV32-NEXT: sub ra, a7, s0
+; RV32-NEXT: snez a7, s4
+; RV32-NEXT: sltu t0, t6, a6
+; RV32-NEXT: sub t6, t6, a6
+; RV32-NEXT: add a7, s3, a7
+; RV32-NEXT: sub t4, t4, t0
+; RV32-NEXT: or a6, a1, t6
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: or t0, t2, t4
+; RV32-NEXT: andi s3, a7, 1
+; RV32-NEXT: or a6, a6, t0
+; RV32-NEXT: or a6, a6, s3
+; RV32-NEXT: sub a4, a4, a3
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: beqz a6, .LBB3_56
+; RV32-NEXT: .LBB3_45: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a3, a2, 31
+; RV32-NEXT: slli a6, ra, 1
+; RV32-NEXT: or t1, a6, a3
+; RV32-NEXT: srli a3, a0, 31
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: beq s6, t1, .LBB3_47
+; RV32-NEXT: # %bb.46: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a3, s6, t1
+; RV32-NEXT: j .LBB3_48
+; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a3, a3, a2
+; RV32-NEXT: .LBB3_48: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a6, a4, 31
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: or a4, a4, a5
+; RV32-NEXT: beq s5, a0, .LBB3_50
+; RV32-NEXT: # %bb.49: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a5, s5, a0
+; RV32-NEXT: j .LBB3_51
+; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a5, a5, a4
+; RV32-NEXT: .LBB3_51: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: xor a6, a6, a2
+; RV32-NEXT: xor a7, s6, t1
+; RV32-NEXT: or a6, a6, a7
+; RV32-NEXT: beqz a6, .LBB3_53
+; RV32-NEXT: # %bb.52: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: mv a5, a3
+; RV32-NEXT: .LBB3_53: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a3, ra, 31
+; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub a3, a6, a3
+; RV32-NEXT: sub a3, a3, a5
+; RV32-NEXT: slli a3, a3, 31
+; RV32-NEXT: srai a5, a3, 31
+; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a7, a5, a3
+; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a3, a5, a3
+; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: and t0, a5, a6
+; RV32-NEXT: sltu a6, a4, a3
+; RV32-NEXT: mv ra, a6
+; RV32-NEXT: beq a0, t0, .LBB3_44
+; RV32-NEXT: # %bb.54: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu ra, a0, t0
+; RV32-NEXT: j .LBB3_44
+; RV32-NEXT: .LBB3_55:
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB3_56: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s2, 31
+; RV32-NEXT: slli a1, t5, 1
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: srli a1, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a2, s2, a1
+; RV32-NEXT: srli a3, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: srli a4, t5, 31
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a1, a1, t3
+; RV32-NEXT: or a3, s11, a3
+; RV32-NEXT: or a4, s8, a4
+; RV32-NEXT: or t5, a3, s1
+; RV32-NEXT: or t3, s9, a2
+; RV32-NEXT: or a5, s10, a0
+; RV32-NEXT: andi a2, a4, 1
+; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB3_57: # %udiv-end
+; RV32-NEXT: sw a1, 0(ra)
+; RV32-NEXT: sw t5, 4(ra)
+; RV32-NEXT: sw t3, 8(ra)
+; RV32-NEXT: sw a5, 12(ra)
+; RV32-NEXT: andi a2, a2, 1
+; RV32-NEXT: sb a2, 16(ra)
+; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 240
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i129:
+; RV64: # %bb.0: # %_udiv-special-cases
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld t1, 16(a2)
+; RV64-NEXT: lui a2, 349525
+; RV64-NEXT: lui a5, 209715
+; RV64-NEXT: lui a6, 61681
+; RV64-NEXT: addi t0, a2, 1365
+; RV64-NEXT: addi a7, a5, 819
+; RV64-NEXT: addi a6, a6, -241
+; RV64-NEXT: slli a2, t0, 32
+; RV64-NEXT: slli a5, a7, 32
+; RV64-NEXT: slli t2, a6, 32
+; RV64-NEXT: add t0, t0, a2
+; RV64-NEXT: add a7, a7, a5
+; RV64-NEXT: add a6, a6, t2
+; RV64-NEXT: srli a2, a4, 1
+; RV64-NEXT: slli a5, t1, 63
+; RV64-NEXT: slli t2, a4, 63
+; RV64-NEXT: or t3, a5, a2
+; RV64-NEXT: srli a2, a3, 1
+; RV64-NEXT: or t4, a2, t2
+; RV64-NEXT: bnez t3, .LBB3_2
+; RV64-NEXT: # %bb.1: # %_udiv-special-cases
+; RV64-NEXT: srli a2, t4, 1
+; RV64-NEXT: or a2, t4, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli a2, a2, 56
+; RV64-NEXT: addi t2, a2, 64
+; RV64-NEXT: j .LBB3_3
+; RV64-NEXT: .LBB3_2:
+; RV64-NEXT: srli a2, t3, 1
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli t2, a2, 56
+; RV64-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV64-NEXT: addi sp, sp, -192
+; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT: slli a2, a3, 63
+; RV64-NEXT: li t5, 128
+; RV64-NEXT: bnez a2, .LBB3_5
+; RV64-NEXT: # %bb.4: # %_udiv-special-cases
+; RV64-NEXT: li s0, 128
+; RV64-NEXT: j .LBB3_6
+; RV64-NEXT: .LBB3_5:
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli s0, a2, 56
+; RV64-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV64-NEXT: ld a5, 0(a1)
+; RV64-NEXT: ld a2, 8(a1)
+; RV64-NEXT: ld s2, 16(a1)
+; RV64-NEXT: or a1, t4, t3
+; RV64-NEXT: addi s1, s0, 128
+; RV64-NEXT: bnez a1, .LBB3_8
+; RV64-NEXT: # %bb.7: # %_udiv-special-cases
+; RV64-NEXT: mv t2, s1
+; RV64-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV64-NEXT: snez s3, a1
+; RV64-NEXT: srli a1, a2, 1
+; RV64-NEXT: slli t3, s2, 63
+; RV64-NEXT: slli t4, a2, 63
+; RV64-NEXT: or a1, t3, a1
+; RV64-NEXT: srli t3, a5, 1
+; RV64-NEXT: or t6, t3, t4
+; RV64-NEXT: bnez a1, .LBB3_10
+; RV64-NEXT: # %bb.9: # %_udiv-special-cases
+; RV64-NEXT: srli t3, t6, 1
+; RV64-NEXT: or t3, t6, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli t3, t3, 56
+; RV64-NEXT: addi s4, t3, 64
+; RV64-NEXT: j .LBB3_11
+; RV64-NEXT: .LBB3_10:
+; RV64-NEXT: srli t3, a1, 1
+; RV64-NEXT: or t3, a1, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli s4, t3, 56
+; RV64-NEXT: .LBB3_11: # %_udiv-special-cases
+; RV64-NEXT: andi t4, s2, 1
+; RV64-NEXT: andi t1, t1, 1
+; RV64-NEXT: or t3, a3, a4
+; RV64-NEXT: or s2, a5, a2
+; RV64-NEXT: sltu s0, s1, s0
+; RV64-NEXT: slli s1, a5, 63
+; RV64-NEXT: addi s3, s3, -1
+; RV64-NEXT: beqz s1, .LBB3_13
+; RV64-NEXT: # %bb.12:
+; RV64-NEXT: srli t5, s1, 1
+; RV64-NEXT: or t5, s1, t5
+; RV64-NEXT: srli s1, t5, 2
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 4
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 8
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 16
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 32
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: not t5, t5
+; RV64-NEXT: srli s1, t5, 1
+; RV64-NEXT: and t0, s1, t0
+; RV64-NEXT: sub t0, t5, t0
+; RV64-NEXT: and t5, t0, a7
+; RV64-NEXT: srli t0, t0, 2
+; RV64-NEXT: and a7, t0, a7
+; RV64-NEXT: add a7, t5, a7
+; RV64-NEXT: srli t0, a7, 4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: and a6, a7, a6
+; RV64-NEXT: slli a7, a6, 8
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 16
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 32
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: srli t5, a6, 56
+; RV64-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV64-NEXT: or t0, t3, t1
+; RV64-NEXT: or a6, s2, t4
+; RV64-NEXT: and a7, s3, s0
+; RV64-NEXT: or t6, t6, a1
+; RV64-NEXT: addi s0, t5, 128
+; RV64-NEXT: bnez t6, .LBB3_15
+; RV64-NEXT: # %bb.14: # %_udiv-special-cases
+; RV64-NEXT: mv s4, s0
+; RV64-NEXT: .LBB3_15: # %_udiv-special-cases
+; RV64-NEXT: seqz a1, t0
+; RV64-NEXT: sltu t0, s0, t5
+; RV64-NEXT: snez t5, t6
+; RV64-NEXT: addi t5, t5, -1
+; RV64-NEXT: and t0, t5, t0
+; RV64-NEXT: sltu t5, t2, s4
+; RV64-NEXT: seqz a6, a6
+; RV64-NEXT: mv t6, t5
+; RV64-NEXT: beq a7, t0, .LBB3_17
+; RV64-NEXT: # %bb.16: # %_udiv-special-cases
+; RV64-NEXT: sltu t6, a7, t0
+; RV64-NEXT: .LBB3_17: # %_udiv-special-cases
+; RV64-NEXT: or a1, a1, a6
+; RV64-NEXT: andi a6, t6, 1
+; RV64-NEXT: sub a7, a7, t0
+; RV64-NEXT: sub t5, a7, t5
+; RV64-NEXT: sub a7, t2, s4
+; RV64-NEXT: beqz a6, .LBB3_19
+; RV64-NEXT: # %bb.18: # %_udiv-special-cases
+; RV64-NEXT: mv t0, a6
+; RV64-NEXT: j .LBB3_20
+; RV64-NEXT: .LBB3_19:
+; RV64-NEXT: sltiu t0, a7, 129
+; RV64-NEXT: xori t0, t0, 1
+; RV64-NEXT: snez t2, t5
+; RV64-NEXT: or t0, t0, t2
+; RV64-NEXT: .LBB3_20: # %_udiv-special-cases
+; RV64-NEXT: or t6, a1, t0
+; RV64-NEXT: addi a1, t6, -1
+; RV64-NEXT: and t2, t4, a1
+; RV64-NEXT: and t0, a1, a2
+; RV64-NEXT: and a1, a1, a5
+; RV64-NEXT: bnez t6, .LBB3_30
+; RV64-NEXT: # %bb.21: # %_udiv-special-cases
+; RV64-NEXT: xori t6, a7, 128
+; RV64-NEXT: or t6, t6, a6
+; RV64-NEXT: or t6, t6, t5
+; RV64-NEXT: beqz t6, .LBB3_30
+; RV64-NEXT: # %bb.22: # %udiv-bb1
+; RV64-NEXT: addi a1, a7, 1
+; RV64-NEXT: sd zero, 64(sp)
+; RV64-NEXT: sd zero, 72(sp)
+; RV64-NEXT: sd zero, 80(sp)
+; RV64-NEXT: sd zero, 88(sp)
+; RV64-NEXT: sd a5, 96(sp)
+; RV64-NEXT: sd a2, 104(sp)
+; RV64-NEXT: sd t4, 112(sp)
+; RV64-NEXT: li t0, 128
+; RV64-NEXT: addi t2, sp, 96
+; RV64-NEXT: neg s1, a7
+; RV64-NEXT: seqz t6, a1
+; RV64-NEXT: sub a7, t0, a7
+; RV64-NEXT: add t5, t5, t6
+; RV64-NEXT: andi t0, a7, 63
+; RV64-NEXT: srli a7, a7, 3
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: xori s2, t0, 63
+; RV64-NEXT: andi a7, a7, 24
+; RV64-NEXT: seqz t0, t6
+; RV64-NEXT: sub s3, t2, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: ld t2, 0(s3)
+; RV64-NEXT: ld s4, 8(s3)
+; RV64-NEXT: andi a7, a6, 1
+; RV64-NEXT: or t6, t6, a7
+; RV64-NEXT: srli a6, t2, 1
+; RV64-NEXT: sll t0, s4, s1
+; RV64-NEXT: srl a6, a6, s2
+; RV64-NEXT: or t0, t0, a6
+; RV64-NEXT: sll a6, t2, s1
+; RV64-NEXT: li t2, 0
+; RV64-NEXT: beqz t6, .LBB3_28
+; RV64-NEXT: # %bb.23: # %udiv-preheader
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: srli s4, s4, 1
+; RV64-NEXT: ld s3, 16(s3)
+; RV64-NEXT: sd zero, 32(sp)
+; RV64-NEXT: sd zero, 40(sp)
+; RV64-NEXT: sd zero, 48(sp)
+; RV64-NEXT: sd zero, 56(sp)
+; RV64-NEXT: sd a5, 0(sp)
+; RV64-NEXT: sd a2, 8(sp)
+; RV64-NEXT: sd t4, 16(sp)
+; RV64-NEXT: sd zero, 24(sp)
+; RV64-NEXT: srli a2, a1, 3
+; RV64-NEXT: srl a5, s4, s2
+; RV64-NEXT: mv t4, sp
+; RV64-NEXT: snez t3, t3
+; RV64-NEXT: andi a2, a2, 24
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add a2, t4, a2
+; RV64-NEXT: ld t3, 0(a2)
+; RV64-NEXT: ld t4, 8(a2)
+; RV64-NEXT: ld a2, 16(a2)
+; RV64-NEXT: sll s1, s3, s1
+; RV64-NEXT: andi s2, a1, 63
+; RV64-NEXT: xori s2, s2, 63
+; RV64-NEXT: or s3, s1, a5
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: slli a5, t4, 1
+; RV64-NEXT: sll a2, a2, s2
+; RV64-NEXT: sll s2, a5, s2
+; RV64-NEXT: srl s1, t4, a1
+; RV64-NEXT: or s1, s1, a2
+; RV64-NEXT: seqz a2, a3
+; RV64-NEXT: sub a2, a4, a2
+; RV64-NEXT: addi a5, t1, 1
+; RV64-NEXT: andi a5, a5, 1
+; RV64-NEXT: andi s3, s3, 1
+; RV64-NEXT: srl t1, t3, a1
+; RV64-NEXT: or s2, t1, s2
+; RV64-NEXT: addi t1, a3, -1
+; RV64-NEXT: j .LBB3_26
+; RV64-NEXT: .LBB3_24: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, a2, s4
+; RV64-NEXT: .LBB3_25: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: srli s1, s1, 63
+; RV64-NEXT: sub t4, a5, s1
+; RV64-NEXT: sub t3, t4, t3
+; RV64-NEXT: slli t3, t3, 63
+; RV64-NEXT: srai s1, t3, 63
+; RV64-NEXT: and s3, s1, a4
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: srli s5, a6, 63
+; RV64-NEXT: sub s4, s4, s3
+; RV64-NEXT: slli s3, t0, 1
+; RV64-NEXT: or s3, s3, s5
+; RV64-NEXT: srli t0, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a6, t2, a6
+; RV64-NEXT: seqz t2, a1
+; RV64-NEXT: or s0, s0, t0
+; RV64-NEXT: or s5, a1, t5
+; RV64-NEXT: sub t5, t5, t2
+; RV64-NEXT: and s6, s1, a3
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: andi t2, s1, 1
+; RV64-NEXT: or t0, t6, s3
+; RV64-NEXT: sltu t6, s2, s6
+; RV64-NEXT: snez s5, s5
+; RV64-NEXT: andi s3, s0, 1
+; RV64-NEXT: sub s1, s4, t6
+; RV64-NEXT: add a7, a7, s5
+; RV64-NEXT: addi a7, a7, 1
+; RV64-NEXT: andi a7, a7, 1
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: or s4, t6, a7
+; RV64-NEXT: sub s2, s2, s6
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: beqz s4, .LBB3_29
+; RV64-NEXT: .LBB3_26: # %udiv-do-while
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: srli t3, s2, 63
+; RV64-NEXT: slli t4, s1, 1
+; RV64-NEXT: slli s2, s2, 1
+; RV64-NEXT: or s4, t4, t3
+; RV64-NEXT: andi t3, s3, 1
+; RV64-NEXT: or s2, s2, t3
+; RV64-NEXT: bne a2, s4, .LBB3_24
+; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, t1, s2
+; RV64-NEXT: j .LBB3_25
+; RV64-NEXT: .LBB3_28:
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: .LBB3_29: # %udiv-loop-exit
+; RV64-NEXT: srli a2, a6, 63
+; RV64-NEXT: slli a3, t0, 1
+; RV64-NEXT: srli a4, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a1, t2, a6
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: or a4, t4, a4
+; RV64-NEXT: or t0, a2, a3
+; RV64-NEXT: andi t2, a4, 1
+; RV64-NEXT: .LBB3_30: # %udiv-end
+; RV64-NEXT: andi a2, t2, 1
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: sd t0, 8(a0)
+; RV64-NEXT: sb a2, 16(a0)
+; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 192
+; RV64-NEXT: ret
%res = udiv i129 %x, %y
ret i129 %res
}
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 04a2268..314e1b4 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,5 +1,6 @@
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
target triple = "wasm32"
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u8_s8:
+; CHECK: loop
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
; CHECK-LABEL: i32_mac_s16:
; CHECK: i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.add
entry:
%cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
+
entry:
%cmp8.not = icmp eq i32 %N, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
new file mode 100644
index 0000000..9716cbe
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s
+
+target triple = "wasm32"
+; relaxed_dot stands for relaxed_dot_i8x16_i7x16_s, as in td
+; relaxed_dot_add stands for i32x4.relaxed_dot_i8x16_i7x16_add_s, as in td
+
+define <8 x i16> @relaxed_dot_sext_1(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_1:
+; CHECK: .functype relaxed_dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+
+define <8 x i16> @relaxed_dot_sext_2(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_2:
+; CHECK: .functype relaxed_dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle2, %shuffle1
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @relaxed_dot_sext_self(<16 x i8> %v) {
+; CHECK-LABEL: relaxed_dot_sext_self:
+; CHECK: .functype relaxed_dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $0
+; CHECK-NEXT: return $pop0
+ %sext = sext <16 x i8> %v to <16 x i16>
+ %mul = mul <16 x i16> %sext, %sext
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @relaxed_dot_add_from_relaxed_dot(<16 x i8> %a, <16 x i8> %b, <4 x i32> %c) {
+; CHECK-LABEL: relaxed_dot_add_from_relaxed_dot:
+; CHECK: .functype relaxed_dot_add_from_relaxed_dot (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s $push0=, $0, $1, $2
+; CHECK-NEXT: return $pop0
+ %relaxed_dot_call = call <8 x i16> @llvm.wasm.relaxed.dot.i8x16.i7x16.signed(<16 x i8> %a, <16 x i8> %b)
+ %sext = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %relaxed_dot_call)
+ %res = add <4 x i32> %sext, %c
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <8 x i16> @relaxed_dot_zext(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_zext:
+; CHECK: .functype relaxed_dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push6=, $0, $1
+; CHECK-NEXT: local.tee $push5=, $2=, $pop6
+; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; CHECK-NEXT: local.tee $push3=, $1=, $pop4
+; CHECK-NEXT: i8x16.shuffle $push1=, $pop5, $pop3, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK-NEXT: i8x16.shuffle $push0=, $2, $1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %zext1 = zext <16 x i8> %a to <16 x i16>
+ %zext2 = zext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %zext1, %zext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+
+}
+
+; INFO: Negative test
+define <8 x i16> @relaxed_dot_wrong_shuffle(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_wrong_shuffle:
+; CHECK: .functype relaxed_dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push1=, $0, $1
+; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push0=, $0, $1
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 5571519..c90344b8 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -502,11 +502,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,16,0,8,8,0,0,0,0,0,2,0,2,0,0,0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -517,7 +517,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
@@ -530,7 +530,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
@@ -541,7 +541,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
@@ -630,7 +630,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0]
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
@@ -641,7 +641,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [171,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm2
; SSE41-NEXT: psrlw $7, %xmm2
@@ -654,7 +654,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX-LABEL: combine_vec_udiv_nonuniform4:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [171,0,0,0]
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
@@ -691,7 +691,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
; SSE2-NEXT: paddw %xmm3, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,u,0,4,0,4,16,0,4,0,0,4,0,0,0,16]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll b/llvm/test/CodeGen/X86/cpus-intel.ll
index 71253c8..646629d 100644
--- a/llvm/test/CodeGen/X86/cpus-intel.ll
+++ b/llvm/test/CodeGen/X86/cpus-intel.ll
@@ -39,6 +39,7 @@
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
@@ -106,6 +107,7 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll
index c2b7068..df04b67 100644
--- a/llvm/test/CodeGen/X86/isel-fpclass.ll
+++ b/llvm/test/CodeGen/X86/isel-fpclass.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86,X86-SDAGISEL
+; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL
; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL
; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL
-; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86,X86-GISEL
-; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X64-GISEL
+; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL
define i1 @isnone_f(float %x) nounwind {
; X86-LABEL: isnone_f:
@@ -23,11 +23,6 @@ define i1 @isnone_f(float %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: xorl %eax, %eax
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isnone_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %eax, %eax
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0)
ret i1 %0
@@ -50,27 +45,22 @@ define i1 @isany_f(float %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: movb $1, %al
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isany_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movb $1, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023)
ret i1 %0
}
define i1 @issignaling_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: issignaling_f:
-; X86-SDAGISEL: # %bb.0:
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %cl
-; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: andb %cl, %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: issignaling_f:
+; X86: # %bb.0:
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %cl
+; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT: setge %al
+; X86-NEXT: andb %cl, %al
+; X86-NEXT: retl
;
; X64-LABEL: issignaling_f:
; X64: # %bb.0:
@@ -97,44 +87,18 @@ define i1 @issignaling_f(float %x) nounwind {
; X86-FASTISEL-NEXT: andb %cl, %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: issignaling_f:
-; X86-GISEL: # %bb.0:
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %dl
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: andb %dl, %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: issignaling_f:
-; X64-GISEL: # %bb.0:
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %dl
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: andb %dl, %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
%a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan"
ret i1 %a0
}
define i1 @isquiet_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: isquiet_f:
; X64: # %bb.0: # %entry
@@ -155,39 +119,19 @@ define i1 @issignaling_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isquiet_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setae %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isquiet_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setae %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan"
ret i1 %0
}
define i1 @not_isquiet_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isquiet_f:
; X64: # %bb.0: # %entry
@@ -208,57 +152,19 @@ define i1 @not_isquiet_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isquiet_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %cl
-; X86-GISEL-NEXT: orb %dl, %cl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %dl
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: andb %dl, %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isquiet_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %cl
-; X64-GISEL-NEXT: orb %dl, %cl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %dl
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: andb %dl, %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan"
ret i1 %0
}
define i1 @isinf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: isinf_f:
; X64: # %bb.0: # %entry
@@ -279,39 +185,19 @@ define i1 @isinf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isinf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isinf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf"
ret i1 %0
}
define i1 @not_isinf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isinf_f:
; X64: # %bb.0: # %entry
@@ -332,43 +218,17 @@ define i1 @not_isinf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isinf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %dl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isinf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf"
ret i1 %0
}
define i1 @is_plus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_plus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_plus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_inf_f:
; X64: # %bb.0: # %entry
@@ -386,34 +246,17 @@ define i1 @is_plus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_plus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_plus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf"
ret i1 %0
}
define i1 @is_minus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -431,34 +274,17 @@ define i1 @is_minus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_minus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_minus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf"
ret i1 %0
}
define i1 @not_is_minus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -476,55 +302,19 @@ define i1 @not_is_minus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_is_minus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: pushl %ebx
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: movl %eax, %ecx
-; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %edx, %edx
-; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %bl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %ah
-; X86-GISEL-NEXT: orb %dl, %ah
-; X86-GISEL-NEXT: orb %bl, %ah
-; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %ah, %al
-; X86-GISEL-NEXT: popl %ebx
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_is_minus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: movl %eax, %ecx
-; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %edx, %edx
-; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %sil
-; X64-GISEL-NEXT: orb %dl, %sil
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: orb %sil, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf"
ret i1 %0
}
define i1 @isfinite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: isfinite_f:
; X64: # %bb.0: # %entry
@@ -545,39 +335,19 @@ define i1 @isfinite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isfinite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isfinite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite"
ret i1 %0
}
define i1 @not_isfinite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isfinite_f:
; X64: # %bb.0: # %entry
@@ -598,43 +368,17 @@ define i1 @not_isfinite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isfinite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %dl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isfinite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite"
ret i1 %0
}
define i1 @is_plus_finite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_plus_finite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setb %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_plus_finite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: setb %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_finite_f:
; X64: # %bb.0: # %entry
@@ -652,23 +396,6 @@ define i1 @is_plus_finite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setb %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_plus_finite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_plus_finite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite"
ret i1 %0
@@ -691,11 +418,6 @@ define i1 @isnone_d(double %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: xorl %eax, %eax
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isnone_d:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %eax, %eax
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0)
ret i1 %0
@@ -718,11 +440,6 @@ define i1 @isany_d(double %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: movb $1, %al
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isany_d:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movb $1, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023)
ret i1 %0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c30..4ec54d8 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2071,7 +2071,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32]
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index d752659..04f0a65 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-NOVBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
@@ -883,6 +883,30 @@ define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal
}
define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: mul256:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul256:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0
@@ -960,6 +984,21 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
}
define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
+; CHECK-SKX-NOVBMI-LABEL: mul512:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul512:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
@@ -1137,6 +1176,14 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector
}
define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
@@ -1192,6 +1239,14 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w
}
define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index cc4bda8..650b562 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 7c1a1e2..874d885 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 6174011..83a0ddb 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
@@ -598,6 +598,33 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: var_shuffle_zero_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512F-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
@@ -921,6 +948,28 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: var_shuffle_zero_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 9b52857..d16b28a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1872,7 +1872,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
@@ -1964,7 +1964,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
-; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 983ae59..3d85d55 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -851,7 +851,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index d565ef0..1602cde 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1673,7 +1673,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
@@ -1750,7 +1750,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 8cb2c7b..a847da6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1223,7 +1223,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1275,7 +1275,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 57874c4..eb39b6a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1480,7 +1480,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1532,7 +1532,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>