diff options
Diffstat (limited to 'llvm/test')
91 files changed, 6416 insertions, 1957 deletions
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll index aeeb21e..ab1945d 100644 --- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll +++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll @@ -150,7 +150,7 @@ define void @test_typedbuffer() { ; CHECK: Kind: CBuffer ; CHECK: CBuffer size: 4 - %cb1 = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %cb1 = call target("dx.CBuffer", <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>) @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr @Constants.str) ; CHECK: Resource [[CB1:[0-9]+]]: ; CHECK: Name: Constants @@ -161,7 +161,7 @@ define void @test_typedbuffer() { ; CHECK: Size: 1 ; CHECK: Class: CBV ; CHECK: Kind: CBuffer - ; CHECK: CBuffer size: 4 + ; CHECK: CBuffer size: 36 ; CHECK-NOT: Resource {{[0-9]+}}: diff --git a/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll new file mode 100644 index 0000000..220c5a1 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print<scalar-evolution>' -disable-output %s 2>&1 | FileCheck %s + +declare void @foo() + +; Tests with multiple guards for the same value and different values. + +define void @test_guard_order_b_then_c_and_d(ptr %a, ptr %b, ptr %c, ptr %d) { +; CHECK-LABEL: 'test_guard_order_b_then_c_and_d' +; CHECK-NEXT: Classifying expressions for: @test_guard_order_b_then_c_and_d +; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 1 +; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_b_then_c_and_d +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %cmp.eq.b = icmp ne ptr %a, %b + %cmp.eq.c = icmp ne ptr %a, %c + %cmp.eq.d = icmp ne ptr %b, %d + call void @llvm.assume(i1 %cmp.eq.b) + call void @llvm.assume(i1 %cmp.eq.c) + call void @llvm.assume(i1 %cmp.eq.d) + br label %loop + +loop: + %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] + %iv.next = getelementptr i8, ptr %iv, i64 1 + call void @foo() + %ec = icmp eq ptr %iv.next, %b + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_guard_order_d_then_c_and_b(ptr %a, ptr %b, ptr %c, ptr %d) { +; CHECK-LABEL: 'test_guard_order_d_then_c_and_b' +; CHECK-NEXT: Classifying expressions for: @test_guard_order_d_then_c_and_b +; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 1 +; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_d_then_c_and_b +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %cmp.eq.b = icmp ne ptr %a, %b + %cmp.eq.c = icmp ne ptr %a, %c + %cmp.eq.d = icmp ne ptr %b, %d + call void @llvm.assume(i1 %cmp.eq.d) + call void @llvm.assume(i1 %cmp.eq.c) + call void @llvm.assume(i1 %cmp.eq.b) + br label %loop + +loop: + %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] + %iv.next = getelementptr i8, ptr %iv, i64 1 + call void @foo() + %ec = icmp eq ptr %iv.next, %b + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll index a0f1b71..bb362d2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines ; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines -define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) { +define void @zero_cycle_regmove_FPR64(double %a, double %b, double %c, double %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov d0, d2 @@ -45,7 +45,7 @@ entry: declare float @foo_double(double, double) -define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) { +define void @zero_cycle_regmove_FPR32(float %a, float %b, float %c, float %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov s0, s2 @@ -86,7 +86,7 @@ entry: declare float @foo_float(float, float) -define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) { +define void @zero_cycle_regmove_FPR16(half %a, half %b, half %c, half %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov s0, s2 diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll index e14e69b..d6d3f15 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines ; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines -define void @zero_cycle_regmov_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) { +define void @zero_cycle_regmove_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) { entry: ; CHECK-LABEL: t: ; NOTCPU-LINUX: mov w0, w2 diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll index 15ee6a0..36655f6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll @@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x ret float %r } +; No FMULV instruction so use knowledge about the architectural maximum size of +; an SVE register to "scalarise" the reduction. + +define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) { +; CHECK-LABEL: fmulv_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a) + ret half %res +} + +define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) { +; CHECK-LABEL: fmulv_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a) + ret half %res +} + +define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) { +; CHECK-LABEL: fmulv_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a) + ret half %res +} + +define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) { +; CHECK-LABEL: fmulv_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: ret + %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a) + ret float %res +} + +define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) { +; CHECK-LABEL: fmulv_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.s, #1.00000000 +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: ret + %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a) + ret float %res +} + +define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) { +; CHECK-LABEL: fmulv_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.d, #1.00000000 +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: ret + %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a) + ret double %res +} + declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>) declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>) declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>) -declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>) -declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>) -declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>) declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>) declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>) declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>) @@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>) declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>) declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>) declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>) + +declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>) +declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>) +declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>) +declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>) +declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>) +declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index be936f0..6fb0315 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) { ret i64 %res } +; No MULV instruction so use knowledge about the architectural maximum size of +; an SVE register to "scalarise" the reduction. + +define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) { +; CHECK-LABEL: mulv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a) + ret i8 %res +} + +define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) { +; CHECK-LABEL: mulv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, #1 // =0x1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a) + ret i16 %res +} + +define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) { +; CHECK-LABEL: mulv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a) + ret i32 %res +} + +define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) { +; CHECK-LABEL: mulv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a) + ret i64 %res +} + ; Test widen vector reduce type declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 6b09424..eee232a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -49,7 +49,6 @@ bb: ret void } -; FIXME: This generates "instid1(/* invalid instid value */)". define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index b07dec3..689d147 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,1153 +6,1147 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $vgpr8 - ; GCN-NEXT: ; implicit-def: $vgpr94 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr106 - ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr133 - ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr25 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_readfirstlane_b32 s17, v16 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s18, s17, 7 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 + ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 + ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 + ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 + ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 + ; GCN-NEXT: ; implicit-def: $vgpr213 + ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr246 + ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr26 + ; GCN-NEXT: ; implicit-def: $vgpr27 + ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 + ; GCN-NEXT: v_add_u32_e32 v231, v19, v26 + ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr28 + ; GCN-NEXT: ; implicit-def: $vgpr29 + ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 + ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 + ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 + ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 + ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 + ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr196 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr211 + ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 + ; GCN-NEXT: ; implicit-def: $vgpr198 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 + ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 + ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr30 + ; GCN-NEXT: ; implicit-def: $vgpr31 + ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 + ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 + ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 + ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 + ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 + ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v230, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[188:191], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] + ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] + ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 + ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 + ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 + ; GCN-NEXT: ds_read_b128 v[160:163], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 + ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7 + ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] + ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] + ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: ds_read_b128 v[230:233], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v199, v[238:239] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v200, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v201, v[242:243] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v202, v[244:245] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] + ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] + ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 + ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 + ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 + ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 + ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 + ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 + ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 + ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 + ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_exp_f32_e32 v114, v138 + ; GCN-NEXT: v_exp_f32_e32 v115, v139 + ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_exp_f32_e32 v117, v141 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 + ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_exp_f32_e32 v120, v144 + ; GCN-NEXT: v_exp_f32_e32 v113, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 + ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 + ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 + ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 + ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112 + ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 + ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 + ; GCN-NEXT: v_exp_f32_e32 v121, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 + ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v97, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 + ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 + ; GCN-NEXT: v_exp_f32_e32 v98, v138 + ; GCN-NEXT: v_exp_f32_e32 v99, v127 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 + ; GCN-NEXT: v_exp_f32_e32 v100, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v199, v[188:189] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v200, v[190:191] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v201, v[192:193] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v202, v[194:195] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v102, v142 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 + ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 + ; GCN-NEXT: v_exp_f32_e32 v103, v150 + ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 + ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 + ; GCN-NEXT: v_exp_f32_e32 v104, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 + ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 + ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 + ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 + ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v106, v156 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 + ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v107, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 + ; GCN-NEXT: v_exp_f32_e32 v108, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] + ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 + ; GCN-NEXT: v_exp_f32_e32 v109, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 + ; GCN-NEXT: v_exp_f32_e32 v110, v156 + ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v111, v146 + ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v80, v129 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] + ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 + ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 + ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 + ; GCN-NEXT: v_exp_f32_e32 v82, v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v83, v135 + ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v199, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v200, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v201, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: ds_write_b64 v202, v[154:155] + ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 + ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v85, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 + ; GCN-NEXT: v_exp_f32_e32 v86, v156 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 + ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 + ; GCN-NEXT: v_exp_f32_e32 v87, v157 + ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 + ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 + ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 + ; GCN-NEXT: v_exp_f32_e32 v88, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 + ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 + ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 + ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 + ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v89, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 + ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v90, v158 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 + ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 + ; GCN-NEXT: v_exp_f32_e32 v91, v139 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 + ; GCN-NEXT: v_exp_f32_e32 v92, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 + ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 + ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v93, v125 + ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v94, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v95, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v125, v129 + ; GCN-NEXT: ds_read_b128 v[132:135], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 + ; GCN-NEXT: v_exp_f32_e32 v126, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 + ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 + ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v127, v143 + ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v199, v[150:151] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v200, v[152:153] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 + ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v202, v[156:157] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 + ; GCN-NEXT: v_exp_f32_e32 v131, v144 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 + ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_exp_f32_e32 v132, v145 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v133, v141 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[68:71], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v72, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v144 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_exp_f32_e32 v74, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] + ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 + ; GCN-NEXT: v_exp_f32_e32 v75, v142 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 + ; GCN-NEXT: v_exp_f32_e32 v76, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 + ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v77, v147 + ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 + ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v78, v67 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_exp_f32_e32 v79, v148 + ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v142, v146 + ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v137, v147 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v138, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_exp_f32_e32 v52, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 + ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 + ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 + ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 + ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 + ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v116, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v117, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v118, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v119, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v120, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v121, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v122, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v123, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v124, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v96, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v97, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v98, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v99, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v100, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 + ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 + ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 + ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 + ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 + ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee..e174fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index aa099b6..b65a1a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 12 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 12 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; EXACTCUTOFF-NEXT: s_nop 11 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll index ddbae64..a95d8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll @@ -1,8 +1,8 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 declare i64 @llvm.readsteadycounter() #0 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788..8803f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -367,77 +367,76 @@ bb: define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-LABEL: illegal_mfma_after_rewrite: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[16:19] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v9, v8 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v8 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6 +; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: global_store_short v[0:1], v10, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 -; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CHECK-NEXT: global_store_short v[0:1], v6, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: global_store_short v[0:1], v24, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] ; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] +; CHECK-NEXT: global_store_short v[0:1], v6, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: global_store_short v[12:13], v0, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 { ; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class: ; CHECK: ; %bb.0: +; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v63, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v62, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v61, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v60, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v59, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v58, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v57, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v56, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v55, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v54, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v53, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v52, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v51, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v50, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v49, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v48, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v47, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v46, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v45, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v44, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v43, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v42, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v41, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v40, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v39, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v38, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v37, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v36, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v35, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v34, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0 -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v33 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v34 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v35 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v36 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v37 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v38 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v39 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v40 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v41 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v42 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v43 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v44 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v45 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v46 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v47 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v48 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v49 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v50 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v51 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v52 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v53 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v54 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v55 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v56 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v57 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v58 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v59 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v60 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 -; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] -; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31] +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] +; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43] +; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45] +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47] +; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49] +; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51] +; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir new file mode 100644 index 0000000..33b2f69 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir @@ -0,0 +1,32 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check that cycle counts are consistent with hazards. + +# CHECK: Cycle: 3 TopQ.A +# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c +# CHECK-NOT: Cycle: 9 TopQ.A +# CHECK: Cycle: 83 TopQ.A +# CHECK: Checking pending node SU(6) +# CHECK: Move SU(6) into Available Q + +--- +name: pending_queue_ready_cycle +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + %2:sgpr_128 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %18:areg_512 = IMPLICIT_DEF + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec + %7:vreg_512 = COPY %18 + SCHED_BARRIER 0 + S_NOP 0, implicit %18, implicit %7, implicit %84 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll index 71dcf11..196560f 100644 --- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll +++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll @@ -11,11 +11,11 @@ declare void @f16_user(half) ; CHECK-SAME: in function four64 ; CHECK-SAME: Type mismatch between intrinsic and DXIL op define void @four64() "hlsl.export" { - %buffer = call target("dx.CBuffer", target("dx.Layout", {double}, 8, 0)) + %buffer = call target("dx.CBuffer", <{ double }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) %load = call {double, double, double, double} @llvm.dx.resource.load.cbufferrow.4( - target("dx.CBuffer", target("dx.Layout", {double}, 8, 0)) %buffer, + target("dx.CBuffer", <{ double }>) %buffer, i32 0) %data = extractvalue {double, double, double, double} %load, 0 @@ -28,11 +28,11 @@ define void @four64() "hlsl.export" { ; CHECK-SAME: in function two32 ; CHECK-SAME: Type mismatch between intrinsic and DXIL op define void @two32() "hlsl.export" { - %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %buffer = call target("dx.CBuffer", <{ float }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) %load = call {float, float} @llvm.dx.resource.load.cbufferrow.2( - target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer, + target("dx.CBuffer", <{ float }>) %buffer, i32 0) %data = extractvalue {float, float} %load, 0 @@ -41,5 +41,5 @@ define void @two32() "hlsl.export" { ret void } -declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_tdx.Layout_sl_f64s_8_0tt(target("dx.CBuffer", target("dx.Layout", { double }, 8, 0)), i32) -declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_tdx.Layout_sl_f32s_4_0tt(target("dx.CBuffer", target("dx.Layout", { float }, 4, 0)), i32) +declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_sl_f64st(target("dx.CBuffer", <{ double }>), i32) +declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_sl_f32st(target("dx.CBuffer", <{ float }>), i32) diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll index d690651..dd40aa8 100644 --- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll +++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll @@ -8,12 +8,12 @@ declare void @f16_user(half) ; CHECK-LABEL: define void @loadf32 define void @loadf32() { - %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %buffer = call target("dx.CBuffer", <{ float }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) ; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %{{.*}}, i32 0) %load = call {float, float, float, float} @llvm.dx.resource.load.cbufferrow.4( - target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer, + target("dx.CBuffer", <{ float }>) %buffer, i32 0) %data = extractvalue {float, float, float, float} %load, 0 @@ -27,12 +27,12 @@ define void @loadf32() { ; CHECK-LABEL: define void @loadf64 define void @loadf64() { %buffer = call - target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24)) + target("dx.CBuffer", <{ <4 x double> }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) ; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %{{.*}}, i32 1) %load = call {double, double} @llvm.dx.resource.load.cbufferrow.2( - target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24)) %buffer, + target("dx.CBuffer", <{ <4 x double> }>) %buffer, i32 1) %data = extractvalue {double, double} %load, 1 @@ -46,12 +46,12 @@ define void @loadf64() { ; CHECK-LABEL: define void @loadf16 define void @loadf16() { %buffer = call - target("dx.CBuffer", target("dx.Layout", {half}, 2, 0)) + target("dx.CBuffer", <{ half }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) ; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %{{.*}}, i32 0) %load = call {half, half, half, half, half, half, half, half} @llvm.dx.resource.load.cbufferrow.8( - target("dx.CBuffer", target("dx.Layout", {half}, 2, 0)) %buffer, + target("dx.CBuffer", <{ half }>) %buffer, i32 0) %data = extractvalue {half, half, half, half, half, half, half, half} %load, 0 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll index bcf82a6..5cd67be 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll @@ -18,7 +18,7 @@ define void @main() #0 { %srv0 = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( i32 1, i32 8, i32 1, i32 0, ptr null) - %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %cbuf = call target("dx.CBuffer", <{ float }>) @llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null) ret void } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll index 70224fc..d792078 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll @@ -14,7 +14,7 @@ define void @main() #0 { ; CHECK: Kind: CBuffer ; CHECK: Flags: ; CHECK: UsedByAtomic64: false - %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %cbuf = call target("dx.CBuffer", <{ float }>) @llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null) ; ByteAddressBuffer Buf : register(t8, space1) diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll index 38f2de2..671fcef 100644 --- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll +++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll @@ -72,7 +72,7 @@ define void @test_bindings() { ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) #[[#ATTR]] ; cbuffer cb0 : register(b0) { int4 i; float4 f; } - %cb0 = call target("dx.CBuffer", target("dx.Layout", {<4 x i32>, <4 x float>}, 32, 0, 16)) + %cb0 = call target("dx.CBuffer", <{ <4 x i32>, <4 x float> }>) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) ; CHECK: [[BUF6:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false) #[[#ATTR]] ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF6]], %dx.types.ResourceProperties { i32 13, i32 32 }) #[[#ATTR]] diff --git a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll index 26b157f..d674863 100644 --- a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll +++ b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll @@ -4,27 +4,27 @@ %__cblayout_CB2 = type <{ float }> %struct.Scalars = type { float, i32, i32 } -@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) poison -@CB2.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) poison +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison +@CB2.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB2) poison define void @main() local_unnamed_addr #1 { entry: ; CHECK: [[CB:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefrombinding - %h = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) - store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %h, ptr @CB.cb, align 4 + %h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %h, ptr @CB.cb, align 4 %_ZL3Out_h.i.i = tail call target("dx.RawBuffer", %struct.Scalars, 1, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) ; CHECK-NOT: load target({{.*}}), ptr @CB.cb - %cb = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)), ptr @CB.cb, align 4 + %cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 ; CHECK: call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target({{.*}}) [[CB]], i32 0) - %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %cb, i32 0) + %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", %__cblayout_CB) %cb, i32 0) %1 = extractvalue { float, float, float, float } %0, 0 call void @llvm.dx.resource.store.rawbuffer(target("dx.RawBuffer", %struct.Scalars, 1, 0) %_ZL3Out_h.i.i, i32 0, i32 0, float %1) - + ; CHECK: [[CB2:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefromimplicitbinding - %h2 = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null) - store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) %h2, ptr @CB2.cb, align 4 + %h2 = tail call target("dx.CBuffer", %__cblayout_CB2) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB2) %h2, ptr @CB2.cb, align 4 ; CHECK-NOT: load target({{.*}}), ptr @CB2.cb - %cb2 = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)), ptr @CB2.cb, align 4 + %cb2 = load target("dx.CBuffer", %__cblayout_CB2), ptr @CB2.cb, align 4 ret void } diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll index f1d28e2..85952c9 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll @@ -1,3 +1,6 @@ +; TODO: Remove this test once we've updated the frontend to use explicit +; padding. The cbuffer-metadata.ll test covers the newer logic. + ; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s ; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT ; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll new file mode 100644 index 0000000..6b90e17 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll @@ -0,0 +1,89 @@ +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s +; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT +; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT + +target triple = "dxil-pc-shadermodel6.6-compute" + +%__cblayout_CB1 = type <{ float, i32, double, <2 x i32> }> +@CB1.cb = global target("dx.CBuffer", %__cblayout_CB1) poison +@CB1.str = private unnamed_addr constant [4 x i8] c"CB1\00", align 1 + +%__cblayout_CB2 = type <{ float, target("dx.Padding", 4), double, float, half, i16, i64, i32 }> +@CB2.cb = global target("dx.CBuffer", %__cblayout_CB2) poison +@CB2.str = private unnamed_addr constant [4 x i8] c"CB2\00", align 1 + +%__cblayout_MyConstants = type <{ + double, target("dx.Padding", 8), + <3 x float>, float, + <3 x double>, half, target("dx.Padding", 6), + <2 x double>, + float, <3 x half>, <3 x half> +}> +@MyConstants.cb = global target("dx.CBuffer", %__cblayout_MyConstants) poison +@MyConstants.str = private unnamed_addr constant [12 x i8] c"MyConstants\00", align 1 + +; PRINT:; Resource Bindings: +; PRINT-NEXT:; +; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count +; PRINT-NEXT:; ---- +; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1 +; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1 +; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1 + +define void @test() #0 { + + ; cbuffer CB1 : register(b0) { + ; float a; + ; int b; + ; double c; + ; int2 d; + ; } + %CB1.cb_h = call target("dx.CBuffer", %__cblayout_CB1) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr @CB1.str) + + ; cbuffer CB2 : register(b0) { + ; float a; + ; double b; + ; float c; + ; half d; + ; uint16_t e; + ; int64_t f; + ; int g; + ;} + %CB2.cb_h = call target("dx.CBuffer", %__cblayout_CB2) + @llvm.dx.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, ptr @CB2.str) + + ; cbuffer CB3 : register(b5) { + ; double B0; + ; float3 B1; + ; float B2; + ; double3 B3; + ; half B4; + ; double2 B5; + ; float B6; + ; half3 B7; + ; half3 B8; + ; } + %CB3.cb_h = call target("dx.CBuffer", %__cblayout_MyConstants) + @llvm.dx.resource.handlefrombinding(i32 15, i32 5, i32 1, i32 0, ptr @MyConstants.str) + + ret void +} + +attributes #0 = { noinline nounwind "hlsl.shader"="compute" } + +; CHECK: %CBuffer.CB1 = type { { float, i32, double, <2 x i32> } } +; CHECK: %CBuffer.CB2 = type { { float, double, float, half, i16, i64, i32 } } +; CHECK: %CBuffer.MyConstants = type { { double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> } } + +; CHECK: @CB1 = external constant %CBuffer.CB1 +; CHECK: @CB2 = external constant %CBuffer.CB2 +; CHECK: @MyConstants = external constant %CBuffer.MyConstants + +; CHECK: !dx.resources = !{[[ResList:[!][0-9]+]]} + +; CHECK: [[ResList]] = !{null, null, [[CBList:[!][0-9]+]], null} +; CHECK: [[CBList]] = !{![[CB1:[0-9]+]], ![[CB2:[0-9]+]], ![[MYCONSTANTS:[0-9]+]]} +; CHECK: ![[CB1]] = !{i32 0, ptr @CB1, !"CB1", i32 0, i32 0, i32 1, i32 24, null} +; CHECK: ![[CB2]] = !{i32 1, ptr @CB2, !"CB2", i32 0, i32 1, i32 1, i32 36, null} +; CHECK: ![[MYCONSTANTS]] = !{i32 2, ptr @MyConstants, !"MyConstants", i32 15, i32 5, i32 1, i32 96, null} diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll index e2a1c09..0b454c1 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll @@ -7,7 +7,7 @@ target triple = "dxil-pc-shadermodel6.6-compute" define void @cbuffer_is_only_binding() { - %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) + %cbuf = call target("dx.CBuffer", <{ float }>) @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr null) ; CHECK: %CBuffer = type { float } diff --git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir new file mode 100644 index 0000000..bf14dcf --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir @@ -0,0 +1,88 @@ +# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s +# REQUIRES: asserts + +# This loop has six stores, which exceeds the limit set by +# `pipeliner-max-num-stores`. + +# CHECK: Too many stores + +--- | + target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + target triple = "hexagon-unknown-linux-musl" + + define void @f(ptr %a, i32 %n) #0 { + entry: + %guard = icmp sgt i32 %n, 0 + %btc = sub nsw i32 %n, 1 + br i1 %guard, label %loop.preheader, label %exit + + loop.preheader: ; preds = %entry + %0 = add i32 %n, 1 + %cgep = getelementptr i8, ptr %a, i32 %0 + br label %loop + + loop: ; preds = %loop.preheader, %loop + %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ] + %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ] + %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2 + store i8 0, ptr %cgep7, align 1 + %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1 + store i8 1, ptr %cgep8, align 1 + store i8 2, ptr %lsr.iv, align 1 + %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1 + store i8 3, ptr %cgep9, align 1 + %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2 + store i8 4, ptr %cgep10, align 1 + %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3 + store i8 5, ptr %cgep11, align 1 + %i.dec = sub i32 %i, 1 + %ec = icmp eq i32 %i.dec, 0 + br i1 %ec, label %exit, label %loop + + exit: ; preds = %loop, %entry + ret void + } + + attributes #0 = { "target-cpu"="hexagonv79" } +... +--- +name: f +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $r0, $r1 + + %7:intregs = COPY $r1 + %6:intregs = COPY $r0 + %8:predregs = C2_cmpgti %7, 0 + J2_jumpf %8, %bb.3, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop.preheader: + successors: %bb.2(0x80000000) + + %0:intregs = A2_addi %7, -1 + %1:intregs = S4_addaddi %7, %6, 1 + %10:intregs = A2_tfrsi 0 + %11:intregs = A2_tfrsi 1 + %14:intregs = COPY %0 + J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.2.loop (machine-block-address-taken): + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:intregs = PHI %1, %bb.1, %4, %bb.2 + S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7) + %4:intregs = A2_addi %2, -1 + S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8) + S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv) + S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9) + S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10) + S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11) + ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def dead $pc + + bb.3.exit: + PS_jmpret $r31, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll new file mode 100644 index 0000000..e67d031 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation. + +; Function for the vector type v2i64 `a + {1, 1}` +define <2 x i64> @test_v2i64(<2 x i64> %a) { +; CHECK-LABEL: test_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: vupklsw v3, v3 +; CHECK-NEXT: vaddudm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <2 x i64> %a, splat (i64 1) + ret <2 x i64> %add +} + +; Function for the vector type v4i32 `a + {1, 1, 1, 1}` +define <4 x i32> @test_v4i32(<4 x i32> %a) { +; CHECK-LABEL: test_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: vadduwm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <4 x i32> %a, splat (i32 1) + ret <4 x i32> %add +} + +; Function for the vector type v8i16 `a + {1, 1, 1, 1, 1, 1, 1, 1}` +define <8 x i16> @test_v8i16(<8 x i16> %a) { +; CHECK-LABEL: test_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltish v3, 1 +; CHECK-NEXT: vadduhm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <8 x i16> %a, splat (i16 1) + ret <8 x i16> %add +} + +; Function for the vector type v16i8 `a + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}` +define <16 x i8> @test_16i8(<16 x i8> %a) { +; CHECK-LABEL: test_16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v3, 1 +; CHECK-NEXT: vaddubm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <16 x i8> %a, splat (i8 1) + ret <16 x i8> %add +} diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll deleted file mode 100644 index e4c93adc..0000000 --- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll +++ /dev/null @@ -1,23 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation. -; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s -; followed by subtraction operation. -define dso_local noundef <4 x i32> @test1(<4 x i32> %a) { -; CHECK-LABEL: test1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vspltisw v3, 1 -; CHECK-NEXT: vadduwm v2, v2, v3 -; CHECK-NEXT: blr -entry: - %add = add <4 x i32> %a, splat (i32 1) - ret <4 x i32> %add -} diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll index 9937627..d7b00f6 100644 --- a/llvm/test/CodeGen/RISCV/idiv_large.ll +++ b/llvm/test/CodeGen/RISCV/idiv_large.ll @@ -1,16 +1,2315 @@ -; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=riscv32 < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 < %s | FileCheck %s --check-prefix=RV64 + +define i64 @udiv_i64(i64 %x, i64 %y) nounwind { +; RV32-LABEL: udiv_i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: call __udivdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i64: +; RV64: # %bb.0: +; RV64-NEXT: tail __udivdi3 + %res = udiv i64 %x, %y + ret i64 %res +} + +define i65 @udiv_i65(i65 %x, i65 %y) nounwind { +; RV32-LABEL: udiv_i65: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: lw a3, 0(a2) +; RV32-NEXT: lw a4, 4(a2) +; RV32-NEXT: lw t1, 8(a2) +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: lui a5, 209715 +; RV32-NEXT: lui a6, 61681 +; RV32-NEXT: addi t0, a2, 1365 +; RV32-NEXT: addi a7, a5, 819 +; RV32-NEXT: addi a6, a6, -241 +; RV32-NEXT: srli a2, a4, 1 +; RV32-NEXT: slli a5, t1, 31 +; RV32-NEXT: slli t3, a4, 31 +; RV32-NEXT: or t2, a5, a2 +; RV32-NEXT: srli a2, a3, 1 +; RV32-NEXT: or t4, a2, t3 +; RV32-NEXT: bnez t2, .LBB1_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a2, t4, 1 +; RV32-NEXT: or a2, t4, a2 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli a2, a2, 24 +; RV32-NEXT: addi t3, a2, 32 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: srli a2, t2, 1 +; RV32-NEXT: or a2, t2, a2 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli t3, a2, 24 +; RV32-NEXT: .LBB1_3: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: sw s0, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 88(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 84(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a2, a3, 31 +; RV32-NEXT: li t5, 64 +; RV32-NEXT: bnez a2, .LBB1_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: li s0, 64 +; RV32-NEXT: j .LBB1_6 +; RV32-NEXT: .LBB1_5: +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli s0, a2, 24 +; RV32-NEXT: .LBB1_6: # %_udiv-special-cases +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw s2, 8(a1) +; RV32-NEXT: or a1, t4, t2 +; RV32-NEXT: addi s1, s0, 64 +; RV32-NEXT: bnez a1, .LBB1_8 +; RV32-NEXT: # %bb.7: # %_udiv-special-cases +; RV32-NEXT: mv t3, s1 +; RV32-NEXT: .LBB1_8: # %_udiv-special-cases +; RV32-NEXT: snez s4, a1 +; RV32-NEXT: srli a1, a2, 1 +; RV32-NEXT: slli t2, s2, 31 +; RV32-NEXT: slli t4, a2, 31 +; RV32-NEXT: or a1, t2, a1 +; RV32-NEXT: srli t2, a5, 1 +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: bnez a1, .LBB1_10 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; RV32-NEXT: srli t2, t6, 1 +; RV32-NEXT: or t2, t6, t2 +; RV32-NEXT: srli t4, t2, 2 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 8 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 16 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: not t2, t2 +; RV32-NEXT: srli t4, t2, 1 +; RV32-NEXT: and t4, t4, t0 +; RV32-NEXT: sub t2, t2, t4 +; RV32-NEXT: and t4, t2, a7 +; RV32-NEXT: srli t2, t2, 2 +; RV32-NEXT: and t2, t2, a7 +; RV32-NEXT: add t2, t4, t2 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: and t2, t2, a6 +; RV32-NEXT: slli t4, t2, 8 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: slli t4, t2, 16 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: srli t2, t2, 24 +; RV32-NEXT: addi s3, t2, 32 +; RV32-NEXT: j .LBB1_11 +; RV32-NEXT: .LBB1_10: +; RV32-NEXT: srli t2, a1, 1 +; RV32-NEXT: or t2, a1, t2 +; RV32-NEXT: srli t4, t2, 2 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 8 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 16 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: not t2, t2 +; RV32-NEXT: srli t4, t2, 1 +; RV32-NEXT: and t4, t4, t0 +; RV32-NEXT: sub t2, t2, t4 +; RV32-NEXT: and t4, t2, a7 +; RV32-NEXT: srli t2, t2, 2 +; RV32-NEXT: and t2, t2, a7 +; RV32-NEXT: add t2, t4, t2 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: and t2, t2, a6 +; RV32-NEXT: slli t4, t2, 8 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: slli t4, t2, 16 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: srli s3, t2, 24 +; RV32-NEXT: .LBB1_11: # %_udiv-special-cases +; RV32-NEXT: andi t4, s2, 1 +; RV32-NEXT: andi t1, t1, 1 +; RV32-NEXT: or t2, a3, a4 +; RV32-NEXT: or s2, a5, a2 +; RV32-NEXT: sltu s0, s1, s0 +; RV32-NEXT: slli s1, a5, 31 +; RV32-NEXT: addi s4, s4, -1 +; RV32-NEXT: beqz s1, .LBB1_13 +; RV32-NEXT: # %bb.12: +; RV32-NEXT: srli t5, s1, 1 +; RV32-NEXT: or t5, s1, t5 +; RV32-NEXT: srli s1, t5, 2 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: srli s1, t5, 4 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: srli s1, t5, 8 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: srli s1, t5, 16 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: not t5, t5 +; RV32-NEXT: srli s1, t5, 1 +; RV32-NEXT: and t0, s1, t0 +; RV32-NEXT: sub t0, t5, t0 +; RV32-NEXT: and t5, t0, a7 +; RV32-NEXT: srli t0, t0, 2 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: add a7, t5, a7 +; RV32-NEXT: srli t0, a7, 4 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: and a6, a7, a6 +; RV32-NEXT: slli a7, a6, 8 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: slli a7, a6, 16 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: srli t5, a6, 24 +; RV32-NEXT: .LBB1_13: # %_udiv-special-cases +; RV32-NEXT: or t0, t2, t1 +; RV32-NEXT: or a6, s2, t4 +; RV32-NEXT: and a7, s4, s0 +; RV32-NEXT: or t6, t6, a1 +; RV32-NEXT: addi s0, t5, 64 +; RV32-NEXT: bnez t6, .LBB1_15 +; RV32-NEXT: # %bb.14: # %_udiv-special-cases +; RV32-NEXT: mv s3, s0 +; RV32-NEXT: .LBB1_15: # %_udiv-special-cases +; RV32-NEXT: seqz a1, t0 +; RV32-NEXT: sltu t0, s0, t5 +; RV32-NEXT: snez t5, t6 +; RV32-NEXT: addi t5, t5, -1 +; RV32-NEXT: and t0, t5, t0 +; RV32-NEXT: sltu t5, t3, s3 +; RV32-NEXT: seqz a6, a6 +; RV32-NEXT: mv t6, t5 +; RV32-NEXT: beq a7, t0, .LBB1_17 +; RV32-NEXT: # %bb.16: # %_udiv-special-cases +; RV32-NEXT: sltu t6, a7, t0 +; RV32-NEXT: .LBB1_17: # %_udiv-special-cases +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: andi a6, t6, 1 +; RV32-NEXT: sub a7, a7, t0 +; RV32-NEXT: sub t5, a7, t5 +; RV32-NEXT: sub a7, t3, s3 +; RV32-NEXT: beqz a6, .LBB1_19 +; RV32-NEXT: # %bb.18: # %_udiv-special-cases +; RV32-NEXT: mv t0, a6 +; RV32-NEXT: j .LBB1_20 +; RV32-NEXT: .LBB1_19: +; RV32-NEXT: sltiu t0, a7, 65 +; RV32-NEXT: xori t0, t0, 1 +; RV32-NEXT: snez t3, t5 +; RV32-NEXT: or t0, t0, t3 +; RV32-NEXT: .LBB1_20: # %_udiv-special-cases +; RV32-NEXT: or t6, a1, t0 +; RV32-NEXT: addi a1, t6, -1 +; RV32-NEXT: and t3, t4, a1 +; RV32-NEXT: and t0, a1, a2 +; RV32-NEXT: and a1, a1, a5 +; RV32-NEXT: bnez t6, .LBB1_30 +; RV32-NEXT: # %bb.21: # %_udiv-special-cases +; RV32-NEXT: xori t6, a7, 64 +; RV32-NEXT: or t6, t6, a6 +; RV32-NEXT: or t6, t6, t5 +; RV32-NEXT: beqz t6, .LBB1_30 +; RV32-NEXT: # %bb.22: # %udiv-bb1 +; RV32-NEXT: addi a1, a7, 1 +; RV32-NEXT: sw zero, 32(sp) +; RV32-NEXT: sw zero, 36(sp) +; RV32-NEXT: sw zero, 40(sp) +; RV32-NEXT: sw zero, 44(sp) +; RV32-NEXT: sw a5, 48(sp) +; RV32-NEXT: sw a2, 52(sp) +; RV32-NEXT: sw t4, 56(sp) +; RV32-NEXT: li t0, 64 +; RV32-NEXT: addi t3, sp, 48 +; RV32-NEXT: neg s1, a7 +; RV32-NEXT: seqz t6, a1 +; RV32-NEXT: sub a7, t0, a7 +; RV32-NEXT: add t5, t5, t6 +; RV32-NEXT: andi t0, a7, 31 +; RV32-NEXT: srli a7, a7, 3 +; RV32-NEXT: or t6, a1, t5 +; RV32-NEXT: xori s2, t0, 31 +; RV32-NEXT: andi a7, a7, 12 +; RV32-NEXT: seqz t0, t6 +; RV32-NEXT: sub s3, t3, a7 +; RV32-NEXT: add a6, a6, t0 +; RV32-NEXT: lw t3, 0(s3) +; RV32-NEXT: lw s4, 4(s3) +; RV32-NEXT: andi a7, a6, 1 +; RV32-NEXT: or t6, t6, a7 +; RV32-NEXT: srli a6, t3, 1 +; RV32-NEXT: sll t0, s4, s1 +; RV32-NEXT: srl a6, a6, s2 +; RV32-NEXT: or t0, t0, a6 +; RV32-NEXT: sll a6, t3, s1 +; RV32-NEXT: li t3, 0 +; RV32-NEXT: beqz t6, .LBB1_28 +; RV32-NEXT: # %bb.23: # %udiv-preheader +; RV32-NEXT: li t6, 0 +; RV32-NEXT: li s0, 0 +; RV32-NEXT: srli s4, s4, 1 +; RV32-NEXT: lw s3, 8(s3) +; RV32-NEXT: sw zero, 16(sp) +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: sw zero, 24(sp) +; RV32-NEXT: sw zero, 28(sp) +; RV32-NEXT: sw a5, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: sw t4, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: srli a2, a1, 3 +; RV32-NEXT: srl a5, s4, s2 +; RV32-NEXT: mv t4, sp +; RV32-NEXT: snez t2, t2 +; RV32-NEXT: andi a2, a2, 12 +; RV32-NEXT: add t1, t1, t2 +; RV32-NEXT: add a2, t4, a2 +; RV32-NEXT: lw t2, 0(a2) +; RV32-NEXT: lw t4, 4(a2) +; RV32-NEXT: lw a2, 8(a2) +; RV32-NEXT: sll s1, s3, s1 +; RV32-NEXT: andi s2, a1, 31 +; RV32-NEXT: xori s2, s2, 31 +; RV32-NEXT: or s3, s1, a5 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: slli a5, t4, 1 +; RV32-NEXT: sll a2, a2, s2 +; RV32-NEXT: sll s2, a5, s2 +; RV32-NEXT: srl s1, t4, a1 +; RV32-NEXT: or s1, s1, a2 +; RV32-NEXT: seqz a2, a3 +; RV32-NEXT: sub a2, a4, a2 +; RV32-NEXT: addi a5, t1, 1 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: andi s3, s3, 1 +; RV32-NEXT: srl t1, t2, a1 +; RV32-NEXT: or s2, t1, s2 +; RV32-NEXT: addi t1, a3, -1 +; RV32-NEXT: j .LBB1_26 +; RV32-NEXT: .LBB1_24: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: sltu t2, a2, s4 +; RV32-NEXT: .LBB1_25: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: srli s1, s1, 31 +; RV32-NEXT: sub t4, a5, s1 +; RV32-NEXT: sub t2, t4, t2 +; RV32-NEXT: slli t2, t2, 31 +; RV32-NEXT: srai s1, t2, 31 +; RV32-NEXT: and s3, s1, a4 +; RV32-NEXT: li t2, 0 +; RV32-NEXT: li t4, 0 +; RV32-NEXT: srli s5, a6, 31 +; RV32-NEXT: sub s4, s4, s3 +; RV32-NEXT: slli s3, t0, 1 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli t0, t0, 31 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: or a6, t3, a6 +; RV32-NEXT: seqz t3, a1 +; RV32-NEXT: or s0, s0, t0 +; RV32-NEXT: or s5, a1, t5 +; RV32-NEXT: sub t5, t5, t3 +; RV32-NEXT: and s6, s1, a3 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi t3, s1, 1 +; RV32-NEXT: or t0, t6, s3 +; RV32-NEXT: sltu t6, s2, s6 +; RV32-NEXT: snez s5, s5 +; RV32-NEXT: andi s3, s0, 1 +; RV32-NEXT: sub s1, s4, t6 +; RV32-NEXT: add a7, a7, s5 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: andi a7, a7, 1 +; RV32-NEXT: or t6, a1, t5 +; RV32-NEXT: or s4, t6, a7 +; RV32-NEXT: sub s2, s2, s6 +; RV32-NEXT: li t6, 0 +; RV32-NEXT: li s0, 0 +; RV32-NEXT: beqz s4, .LBB1_29 +; RV32-NEXT: .LBB1_26: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli t2, s2, 31 +; RV32-NEXT: slli t4, s1, 1 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or s4, t4, t2 +; RV32-NEXT: andi t2, s3, 1 +; RV32-NEXT: or s2, s2, t2 +; RV32-NEXT: bne a2, s4, .LBB1_24 +; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: sltu t2, t1, s2 +; RV32-NEXT: j .LBB1_25 +; RV32-NEXT: .LBB1_28: +; RV32-NEXT: li t2, 0 +; RV32-NEXT: li t4, 0 +; RV32-NEXT: .LBB1_29: # %udiv-loop-exit +; RV32-NEXT: srli a2, a6, 31 +; RV32-NEXT: slli a3, t0, 1 +; RV32-NEXT: srli a4, t0, 31 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: or a1, t3, a6 +; RV32-NEXT: or a2, t2, a2 +; RV32-NEXT: or a4, t4, a4 +; RV32-NEXT: or t0, a2, a3 +; RV32-NEXT: andi t3, a4, 1 +; RV32-NEXT: .LBB1_30: # %udiv-end +; RV32-NEXT: andi a2, t3, 1 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw t0, 4(a0) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i65: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a3, a3, 1 +; RV64-NEXT: call __udivti3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %res = udiv i65 %x, %y + ret i65 %res +} define i128 @udiv_i128(i128 %x, i128 %y) nounwind { -; CHECK-LABEL: udiv_i128: -; CHECK: call __udivti3 +; RV32-LABEL: udiv_i128: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -160 +; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s7, a0 +; RV32-NEXT: lw s8, 0(a2) +; RV32-NEXT: lw s9, 4(a2) +; RV32-NEXT: lw s11, 8(a2) +; RV32-NEXT: lw ra, 12(a2) +; RV32-NEXT: lui t4, 349525 +; RV32-NEXT: addi t4, t4, 1365 +; RV32-NEXT: lui t3, 209715 +; RV32-NEXT: addi t3, t3, 819 +; RV32-NEXT: lui t2, 61681 +; RV32-NEXT: addi t2, t2, -241 +; RV32-NEXT: bnez s9, .LBB2_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a0, s8, 1 +; RV32-NEXT: or a0, s8, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi t6, a0, 32 +; RV32-NEXT: j .LBB2_3 +; RV32-NEXT: .LBB2_2: +; RV32-NEXT: srli a0, s9, 1 +; RV32-NEXT: or a0, s9, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli t6, a0, 24 +; RV32-NEXT: .LBB2_3: # %_udiv-special-cases +; RV32-NEXT: lw a6, 4(a1) +; RV32-NEXT: or s0, s11, ra +; RV32-NEXT: bnez ra, .LBB2_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: srli a0, s11, 1 +; RV32-NEXT: or a0, s11, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi t5, a0, 32 +; RV32-NEXT: j .LBB2_6 +; RV32-NEXT: .LBB2_5: +; RV32-NEXT: srli a0, ra, 1 +; RV32-NEXT: or a0, ra, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli t5, a0, 24 +; RV32-NEXT: .LBB2_6: # %_udiv-special-cases +; RV32-NEXT: lw a7, 12(a1) +; RV32-NEXT: addi a0, t6, 64 +; RV32-NEXT: bnez s0, .LBB2_8 +; RV32-NEXT: # %bb.7: # %_udiv-special-cases +; RV32-NEXT: mv t5, a0 +; RV32-NEXT: .LBB2_8: # %_udiv-special-cases +; RV32-NEXT: lw t1, 0(a1) +; RV32-NEXT: lw t0, 8(a1) +; RV32-NEXT: snez s3, s0 +; RV32-NEXT: bnez a6, .LBB2_10 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; RV32-NEXT: srli a1, t1, 1 +; RV32-NEXT: or a1, t1, a1 +; RV32-NEXT: srli a3, a1, 2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 8 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 16 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: srli a3, a1, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: and a3, a1, t3 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: and a1, a1, t3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: and a1, a1, t2 +; RV32-NEXT: slli a3, a1, 8 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: slli a3, a1, 16 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: srli a1, a1, 24 +; RV32-NEXT: addi a3, a1, 32 +; RV32-NEXT: j .LBB2_11 +; RV32-NEXT: .LBB2_10: +; RV32-NEXT: srli a1, a6, 1 +; RV32-NEXT: or a1, a6, a1 +; RV32-NEXT: srli a3, a1, 2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 8 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 16 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: srli a3, a1, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: and a3, a1, t3 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: and a1, a1, t3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: and a1, a1, t2 +; RV32-NEXT: slli a3, a1, 8 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: slli a3, a1, 16 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: srli a3, a1, 24 +; RV32-NEXT: .LBB2_11: # %_udiv-special-cases +; RV32-NEXT: or a1, s9, ra +; RV32-NEXT: or s0, s8, s11 +; RV32-NEXT: or s1, a6, a7 +; RV32-NEXT: or s2, t1, t0 +; RV32-NEXT: sltu t6, a0, t6 +; RV32-NEXT: addi s3, s3, -1 +; RV32-NEXT: addi a0, a3, 64 +; RV32-NEXT: or s4, t0, a7 +; RV32-NEXT: sltu s5, a0, a3 +; RV32-NEXT: snez s6, s4 +; RV32-NEXT: addi s6, s6, -1 +; RV32-NEXT: bnez a7, .LBB2_13 +; RV32-NEXT: # %bb.12: # %_udiv-special-cases +; RV32-NEXT: srli a3, t0, 1 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t4 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t3 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t2 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi a3, a3, 32 +; RV32-NEXT: j .LBB2_14 +; RV32-NEXT: .LBB2_13: +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: or a3, a7, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t4 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t3 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t2 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: .LBB2_14: # %_udiv-special-cases +; RV32-NEXT: or s0, s0, a1 +; RV32-NEXT: or a5, s2, s1 +; RV32-NEXT: and a1, s3, t6 +; RV32-NEXT: and a4, s6, s5 +; RV32-NEXT: bnez s4, .LBB2_16 +; RV32-NEXT: # %bb.15: # %_udiv-special-cases +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB2_16: # %_udiv-special-cases +; RV32-NEXT: seqz a0, s0 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: sltu t2, t5, a3 +; RV32-NEXT: sub t4, a1, a4 +; RV32-NEXT: mv t3, t2 +; RV32-NEXT: beq a1, a4, .LBB2_18 +; RV32-NEXT: # %bb.17: # %_udiv-special-cases +; RV32-NEXT: sltu t3, a1, a4 +; RV32-NEXT: .LBB2_18: # %_udiv-special-cases +; RV32-NEXT: sub t2, t4, t2 +; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: neg t4, t3 +; RV32-NEXT: seqz t6, t3 +; RV32-NEXT: addi t6, t6, -1 +; RV32-NEXT: or a1, t4, t6 +; RV32-NEXT: sub t3, t5, a3 +; RV32-NEXT: beqz a1, .LBB2_20 +; RV32-NEXT: # %bb.19: # %_udiv-special-cases +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: j .LBB2_21 +; RV32-NEXT: .LBB2_20: +; RV32-NEXT: snez a1, t2 +; RV32-NEXT: sltiu a3, t3, 128 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: .LBB2_21: # %_udiv-special-cases +; RV32-NEXT: or a5, a0, a1 +; RV32-NEXT: addi a3, a5, -1 +; RV32-NEXT: and a0, a3, a7 +; RV32-NEXT: and a1, a3, t0 +; RV32-NEXT: and a4, a3, a6 +; RV32-NEXT: and a3, a3, t1 +; RV32-NEXT: bnez a5, .LBB2_26 +; RV32-NEXT: # %bb.22: # %_udiv-special-cases +; RV32-NEXT: xori a5, t3, 127 +; RV32-NEXT: or a5, a5, t4 +; RV32-NEXT: or t5, t2, t6 +; RV32-NEXT: or a5, a5, t5 +; RV32-NEXT: beqz a5, .LBB2_26 +; RV32-NEXT: # %bb.23: # %udiv-bb1 +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: addi a1, t3, 1 +; RV32-NEXT: sw zero, 72(sp) +; RV32-NEXT: sw zero, 76(sp) +; RV32-NEXT: sw zero, 80(sp) +; RV32-NEXT: sw zero, 84(sp) +; RV32-NEXT: sw t1, 88(sp) +; RV32-NEXT: sw a6, 92(sp) +; RV32-NEXT: sw t0, 96(sp) +; RV32-NEXT: sw a7, 100(sp) +; RV32-NEXT: li a0, 127 +; RV32-NEXT: addi a2, sp, 88 +; RV32-NEXT: seqz a3, a1 +; RV32-NEXT: sub a0, a0, t3 +; RV32-NEXT: add t2, t2, a3 +; RV32-NEXT: andi a3, a0, 31 +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: or a4, a1, t2 +; RV32-NEXT: xori a3, a3, 31 +; RV32-NEXT: andi a0, a0, 12 +; RV32-NEXT: seqz t5, a4 +; RV32-NEXT: sub a2, a2, a0 +; RV32-NEXT: add t5, t4, t5 +; RV32-NEXT: lw a0, 0(a2) +; RV32-NEXT: lw a4, 4(a2) +; RV32-NEXT: lw a5, 8(a2) +; RV32-NEXT: lw a2, 12(a2) +; RV32-NEXT: sltu t4, t5, t4 +; RV32-NEXT: or s0, a1, t5 +; RV32-NEXT: add t4, t6, t4 +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: or s0, s0, t6 +; RV32-NEXT: srli t6, a5, 1 +; RV32-NEXT: srli s1, a4, 1 +; RV32-NEXT: srli s2, a0, 1 +; RV32-NEXT: srl t6, t6, a3 +; RV32-NEXT: srl s1, s1, a3 +; RV32-NEXT: srl a3, s2, a3 +; RV32-NEXT: not t3, t3 +; RV32-NEXT: sll a2, a2, t3 +; RV32-NEXT: or s2, a2, t6 +; RV32-NEXT: sll a2, a5, t3 +; RV32-NEXT: sll a4, a4, t3 +; RV32-NEXT: or s1, a2, s1 +; RV32-NEXT: or t6, a4, a3 +; RV32-NEXT: sll t3, a0, t3 +; RV32-NEXT: bnez s0, .LBB2_27 +; RV32-NEXT: # %bb.24: +; RV32-NEXT: li s6, 0 +; RV32-NEXT: li s7, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: .LBB2_25: # %udiv-loop-exit +; RV32-NEXT: srli a0, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a0, s2, a0 +; RV32-NEXT: srli a1, t6, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or a1, s1, a1 +; RV32-NEXT: srli a2, t3, 31 +; RV32-NEXT: slli t6, t6, 1 +; RV32-NEXT: slli a3, t3, 1 +; RV32-NEXT: or a3, s0, a3 +; RV32-NEXT: or a2, s6, a2 +; RV32-NEXT: or a4, a2, t6 +; RV32-NEXT: or a1, s7, a1 +; RV32-NEXT: or a0, s8, a0 +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .LBB2_26: # %udiv-end +; RV32-NEXT: sw a3, 0(s7) +; RV32-NEXT: sw a4, 4(s7) +; RV32-NEXT: sw a1, 8(s7) +; RV32-NEXT: sw a0, 12(s7) +; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 160 +; RV32-NEXT: ret +; RV32-NEXT: .LBB2_27: # %udiv-preheader +; RV32-NEXT: li s0, 0 +; RV32-NEXT: li s5, 0 +; RV32-NEXT: li s3, 0 +; RV32-NEXT: li s4, 0 +; RV32-NEXT: sw zero, 56(sp) +; RV32-NEXT: sw zero, 60(sp) +; RV32-NEXT: sw zero, 64(sp) +; RV32-NEXT: sw zero, 68(sp) +; RV32-NEXT: sw t1, 40(sp) +; RV32-NEXT: sw a6, 44(sp) +; RV32-NEXT: sw t0, 48(sp) +; RV32-NEXT: sw a7, 52(sp) +; RV32-NEXT: srli a0, a1, 3 +; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: andi a0, a0, 12 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: andi a5, a1, 31 +; RV32-NEXT: xori a5, a5, 31 +; RV32-NEXT: slli a6, a4, 1 +; RV32-NEXT: slli a7, a3, 1 +; RV32-NEXT: slli t0, a2, 1 +; RV32-NEXT: sll a6, a6, a5 +; RV32-NEXT: sll a7, a7, a5 +; RV32-NEXT: sll a5, t0, a5 +; RV32-NEXT: seqz t0, s8 +; RV32-NEXT: srl a3, a3, a1 +; RV32-NEXT: or s10, a3, a6 +; RV32-NEXT: or a3, s8, s9 +; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sub a6, s9, t0 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: srl a2, a2, a1 +; RV32-NEXT: or s9, a2, a7 +; RV32-NEXT: sub a7, s11, a3 +; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sltu a2, s11, a3 +; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sub a2, ra, a2 +; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: srl ra, a4, a1 +; RV32-NEXT: or t1, a0, a5 +; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s8, s8, -1 +; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: j .LBB2_29 +; RV32-NEXT: .LBB2_28: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: li s6, 0 +; RV32-NEXT: sub a0, a0, a5 +; RV32-NEXT: srli a5, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a5, s2, a5 +; RV32-NEXT: srli s2, t6, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or s1, s1, s2 +; RV32-NEXT: srli s2, t3, 31 +; RV32-NEXT: slli t6, t6, 1 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: or t6, t6, s2 +; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: and s2, s10, a2 +; RV32-NEXT: or t3, s0, t3 +; RV32-NEXT: sub a2, a3, s2 +; RV32-NEXT: sltu a3, a3, s2 +; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: and s0, s10, t0 +; RV32-NEXT: sub t0, s9, s0 +; RV32-NEXT: or s2, a1, t2 +; RV32-NEXT: sub s9, a0, a4 +; RV32-NEXT: seqz a0, a1 +; RV32-NEXT: sub t2, t2, a0 +; RV32-NEXT: or t6, s5, t6 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi s0, s10, 1 +; RV32-NEXT: seqz a0, s2 +; RV32-NEXT: or s1, s3, s1 +; RV32-NEXT: or s2, s4, a5 +; RV32-NEXT: sub s10, a2, ra +; RV32-NEXT: sltu a2, a2, ra +; RV32-NEXT: sub a3, t0, a3 +; RV32-NEXT: sltu a4, t5, a0 +; RV32-NEXT: sub t5, t5, a0 +; RV32-NEXT: sub ra, a3, a2 +; RV32-NEXT: sub t4, t4, a4 +; RV32-NEXT: or a0, t2, t4 +; RV32-NEXT: or a2, a1, t5 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sub t1, s11, t1 +; RV32-NEXT: li s5, 0 +; RV32-NEXT: li s3, 0 +; RV32-NEXT: li s4, 0 +; RV32-NEXT: beqz a0, .LBB2_25 +; RV32-NEXT: .LBB2_29: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli a0, t1, 31 +; RV32-NEXT: slli a3, s9, 1 +; RV32-NEXT: slli t1, t1, 1 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: srli a3, s2, 31 +; RV32-NEXT: or s11, t1, a3 +; RV32-NEXT: beq a6, a0, .LBB2_31 +; RV32-NEXT: # %bb.30: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: sltu a4, a6, a0 +; RV32-NEXT: j .LBB2_32 +; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: sltu a4, a2, s11 +; RV32-NEXT: .LBB2_32: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: srli a3, s10, 31 +; RV32-NEXT: slli ra, ra, 1 +; RV32-NEXT: srli a5, s9, 31 +; RV32-NEXT: slli s10, s10, 1 +; RV32-NEXT: or s9, ra, a3 +; RV32-NEXT: or a3, s10, a5 +; RV32-NEXT: sub a5, a7, a3 +; RV32-NEXT: sltu t1, a7, a3 +; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: sub s6, t0, s9 +; RV32-NEXT: sltu a4, a5, a4 +; RV32-NEXT: sub a5, s6, t1 +; RV32-NEXT: sub a5, a5, a4 +; RV32-NEXT: srai s10, a5, 31 +; RV32-NEXT: and t1, s10, a2 +; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: and a5, s10, a2 +; RV32-NEXT: sltu a4, s11, t1 +; RV32-NEXT: mv ra, a4 +; RV32-NEXT: beq a0, a5, .LBB2_28 +; RV32-NEXT: # %bb.33: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: sltu ra, a0, a5 +; RV32-NEXT: j .LBB2_28 +; +; RV64-LABEL: udiv_i128: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: call __udivti3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %res = udiv i128 %x, %y ret i128 %res } define i129 @udiv_i129(i129 %x, i129 %y) nounwind { -; CHECK-LABEL: udiv_i129: -; CHECK-NOT: call{{.*}}div +; RV32-LABEL: udiv_i129: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -240 +; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: mv ra, a0 +; RV32-NEXT: lw t2, 16(a2) +; RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a6, 8(a2) +; RV32-NEXT: lw a0, 12(a2) +; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi t5, a0, 1365 +; RV32-NEXT: addi t4, a2, 819 +; RV32-NEXT: addi t3, a3, -241 +; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a0, a6, 31 +; RV32-NEXT: srli a2, a5, 1 +; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a3, a5, 31 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: srli a2, a4, 1 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: bnez a0, .LBB3_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a3, a2, 1 +; RV32-NEXT: or a3, a2, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi a6, a3, 32 +; RV32-NEXT: j .LBB3_3 +; RV32-NEXT: .LBB3_2: +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a6, a3, 24 +; RV32-NEXT: .LBB3_3: # %_udiv-special-cases +; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: slli a5, t2, 31 +; RV32-NEXT: slli a7, a7, 31 +; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: srli t0, a4, 1 +; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: slli a4, a4, 31 +; RV32-NEXT: li s2, 64 +; RV32-NEXT: bnez a4, .LBB3_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: li t6, 64 +; RV32-NEXT: j .LBB3_6 +; RV32-NEXT: .LBB3_5: +; RV32-NEXT: srli t1, a4, 1 +; RV32-NEXT: or t1, a4, t1 +; RV32-NEXT: srli t6, t1, 2 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 4 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 8 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 16 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: not t1, t1 +; RV32-NEXT: srli t6, t1, 1 +; RV32-NEXT: and t6, t6, t5 +; RV32-NEXT: sub t1, t1, t6 +; RV32-NEXT: and t6, t1, t4 +; RV32-NEXT: srli t1, t1, 2 +; RV32-NEXT: and t1, t1, t4 +; RV32-NEXT: add t1, t6, t1 +; RV32-NEXT: srli t6, t1, 4 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: and t1, t1, t3 +; RV32-NEXT: slli t6, t1, 8 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: slli t6, t1, 16 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: srli t6, t1, 24 +; RV32-NEXT: .LBB3_6: # %_udiv-special-cases +; RV32-NEXT: or t1, a5, a3 +; RV32-NEXT: or a7, t0, a7 +; RV32-NEXT: bnez a4, .LBB3_8 +; RV32-NEXT: # %bb.7: # %_udiv-special-cases +; RV32-NEXT: li t6, 128 +; RV32-NEXT: .LBB3_8: # %_udiv-special-cases +; RV32-NEXT: or a5, a7, t1 +; RV32-NEXT: addi a4, a6, 64 +; RV32-NEXT: addi a3, t6, 128 +; RV32-NEXT: or a0, a0, t1 +; RV32-NEXT: or a2, a2, a7 +; RV32-NEXT: or s3, a2, a0 +; RV32-NEXT: sltu s0, a3, t6 +; RV32-NEXT: bnez s3, .LBB3_11 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; RV32-NEXT: mv t6, s0 +; RV32-NEXT: beqz t1, .LBB3_12 +; RV32-NEXT: .LBB3_10: +; RV32-NEXT: srli a0, t1, 1 +; RV32-NEXT: or a0, t1, a0 +; RV32-NEXT: srli a2, a0, 2 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 8 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 16 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a2, a0, 1 +; RV32-NEXT: and a2, a2, t5 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: and a2, a0, t4 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: slli a2, a0, 8 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: srli s1, a0, 24 +; RV32-NEXT: beqz a5, .LBB3_13 +; RV32-NEXT: j .LBB3_14 +; RV32-NEXT: .LBB3_11: +; RV32-NEXT: snez a0, a5 +; RV32-NEXT: sltu a2, a4, a6 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and t6, a0, a2 +; RV32-NEXT: bnez t1, .LBB3_10 +; RV32-NEXT: .LBB3_12: # %_udiv-special-cases +; RV32-NEXT: srli a0, a7, 1 +; RV32-NEXT: or a0, a7, a0 +; RV32-NEXT: srli a2, a0, 2 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 8 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 16 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a2, a0, 1 +; RV32-NEXT: and a2, a2, t5 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: and a2, a0, t4 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: slli a2, a0, 8 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi s1, a0, 32 +; RV32-NEXT: bnez a5, .LBB3_14 +; RV32-NEXT: .LBB3_13: # %_udiv-special-cases +; RV32-NEXT: mv s1, a4 +; RV32-NEXT: .LBB3_14: # %_udiv-special-cases +; RV32-NEXT: lw a7, 0(a1) +; RV32-NEXT: lw t0, 4(a1) +; RV32-NEXT: lw a6, 8(a1) +; RV32-NEXT: bnez s3, .LBB3_16 +; RV32-NEXT: # %bb.15: # %_udiv-special-cases +; RV32-NEXT: mv s1, a3 +; RV32-NEXT: .LBB3_16: # %_udiv-special-cases +; RV32-NEXT: lw t1, 12(a1) +; RV32-NEXT: lw a1, 16(a1) +; RV32-NEXT: slli a0, a6, 31 +; RV32-NEXT: srli a2, t0, 1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: slli a2, t0, 31 +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: bnez a0, .LBB3_18 +; RV32-NEXT: # %bb.17: # %_udiv-special-cases +; RV32-NEXT: srli a3, a2, 1 +; RV32-NEXT: or a3, a2, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi s5, a3, 32 +; RV32-NEXT: j .LBB3_19 +; RV32-NEXT: .LBB3_18: +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli s5, a3, 24 +; RV32-NEXT: .LBB3_19: # %_udiv-special-cases +; RV32-NEXT: srli a3, t1, 1 +; RV32-NEXT: slli a4, a1, 31 +; RV32-NEXT: slli a5, t1, 31 +; RV32-NEXT: slli s4, a7, 31 +; RV32-NEXT: srli s6, a6, 1 +; RV32-NEXT: beqz s4, .LBB3_21 +; RV32-NEXT: # %bb.20: +; RV32-NEXT: srli s2, s4, 1 +; RV32-NEXT: or s2, s4, s2 +; RV32-NEXT: srli s7, s2, 2 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 4 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 8 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 16 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: not s2, s2 +; RV32-NEXT: srli s7, s2, 1 +; RV32-NEXT: and s7, s7, t5 +; RV32-NEXT: sub s2, s2, s7 +; RV32-NEXT: and s7, s2, t4 +; RV32-NEXT: srli s2, s2, 2 +; RV32-NEXT: and s2, s2, t4 +; RV32-NEXT: add s2, s7, s2 +; RV32-NEXT: srli s7, s2, 4 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: and s2, s2, t3 +; RV32-NEXT: slli s7, s2, 8 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: slli s7, s2, 16 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: srli s2, s2, 24 +; RV32-NEXT: .LBB3_21: # %_udiv-special-cases +; RV32-NEXT: or s7, a4, a3 +; RV32-NEXT: or s6, s6, a5 +; RV32-NEXT: bnez s4, .LBB3_23 +; RV32-NEXT: # %bb.22: # %_udiv-special-cases +; RV32-NEXT: li s2, 128 +; RV32-NEXT: .LBB3_23: # %_udiv-special-cases +; RV32-NEXT: or s4, s6, s7 +; RV32-NEXT: addi a5, s5, 64 +; RV32-NEXT: addi a3, s2, 128 +; RV32-NEXT: or a0, a0, s7 +; RV32-NEXT: or a4, a2, s6 +; RV32-NEXT: or a4, a4, a0 +; RV32-NEXT: sltu a0, a3, s2 +; RV32-NEXT: bnez a4, .LBB3_26 +; RV32-NEXT: # %bb.24: # %_udiv-special-cases +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: snez s2, s3 +; RV32-NEXT: beqz s7, .LBB3_27 +; RV32-NEXT: .LBB3_25: +; RV32-NEXT: srli s3, s7, 1 +; RV32-NEXT: or s3, s7, s3 +; RV32-NEXT: srli s5, s3, 2 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 4 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 8 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 16 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: not s3, s3 +; RV32-NEXT: srli s5, s3, 1 +; RV32-NEXT: and t5, s5, t5 +; RV32-NEXT: sub t5, s3, t5 +; RV32-NEXT: and s3, t5, t4 +; RV32-NEXT: srli t5, t5, 2 +; RV32-NEXT: and t4, t5, t4 +; RV32-NEXT: add t4, s3, t4 +; RV32-NEXT: srli t5, t4, 4 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: and t3, t4, t3 +; RV32-NEXT: slli t4, t3, 8 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: slli t4, t3, 16 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: srli t3, t3, 24 +; RV32-NEXT: j .LBB3_28 +; RV32-NEXT: .LBB3_26: +; RV32-NEXT: snez a2, s4 +; RV32-NEXT: sltu s2, a5, s5 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, s2 +; RV32-NEXT: snez s2, s3 +; RV32-NEXT: bnez s7, .LBB3_25 +; RV32-NEXT: .LBB3_27: # %_udiv-special-cases +; RV32-NEXT: srli s3, s6, 1 +; RV32-NEXT: or s3, s6, s3 +; RV32-NEXT: srli s5, s3, 2 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 4 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 8 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 16 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: not s3, s3 +; RV32-NEXT: srli s5, s3, 1 +; RV32-NEXT: and t5, s5, t5 +; RV32-NEXT: sub t5, s3, t5 +; RV32-NEXT: and s3, t5, t4 +; RV32-NEXT: srli t5, t5, 2 +; RV32-NEXT: and t4, t5, t4 +; RV32-NEXT: add t4, s3, t4 +; RV32-NEXT: srli t5, t4, 4 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: and t3, t4, t3 +; RV32-NEXT: slli t4, t3, 8 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: slli t4, t3, 16 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: srli t3, t3, 24 +; RV32-NEXT: addi t3, t3, 32 +; RV32-NEXT: .LBB3_28: # %_udiv-special-cases +; RV32-NEXT: xori t4, s0, 1 +; RV32-NEXT: addi s2, s2, -1 +; RV32-NEXT: bnez s4, .LBB3_30 +; RV32-NEXT: # %bb.29: # %_udiv-special-cases +; RV32-NEXT: mv t3, a5 +; RV32-NEXT: .LBB3_30: # %_udiv-special-cases +; RV32-NEXT: andi s11, a1, 1 +; RV32-NEXT: andi s8, t2, 1 +; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: or s9, a1, a5 +; RV32-NEXT: or t2, a7, a6 +; RV32-NEXT: neg a1, t4 +; RV32-NEXT: and s0, s2, s0 +; RV32-NEXT: bnez a4, .LBB3_32 +; RV32-NEXT: # %bb.31: # %_udiv-special-cases +; RV32-NEXT: mv t3, a3 +; RV32-NEXT: .LBB3_32: # %_udiv-special-cases +; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: or s10, a3, a5 +; RV32-NEXT: or a5, s9, s8 +; RV32-NEXT: or t4, t0, t1 +; RV32-NEXT: or t5, t2, s11 +; RV32-NEXT: and a1, s0, a1 +; RV32-NEXT: xori a3, a0, 1 +; RV32-NEXT: snez a4, a4 +; RV32-NEXT: neg a3, a3 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: sltu a4, s1, t3 +; RV32-NEXT: and t2, a0, a3 +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: beq t6, a2, .LBB3_34 +; RV32-NEXT: # %bb.33: # %_udiv-special-cases +; RV32-NEXT: sltu a3, t6, a2 +; RV32-NEXT: .LBB3_34: # %_udiv-special-cases +; RV32-NEXT: or a0, a5, s10 +; RV32-NEXT: or t5, t5, t4 +; RV32-NEXT: sltu t4, a1, t2 +; RV32-NEXT: mv s0, a3 +; RV32-NEXT: beq a1, t2, .LBB3_36 +; RV32-NEXT: # %bb.35: # %_udiv-special-cases +; RV32-NEXT: mv s0, t4 +; RV32-NEXT: .LBB3_36: # %_udiv-special-cases +; RV32-NEXT: seqz a5, a0 +; RV32-NEXT: seqz t5, t5 +; RV32-NEXT: andi a0, s0, 1 +; RV32-NEXT: sub a2, t6, a2 +; RV32-NEXT: sub a1, a1, t2 +; RV32-NEXT: sub t2, a2, a4 +; RV32-NEXT: sltu a2, a1, a3 +; RV32-NEXT: add a2, t4, a2 +; RV32-NEXT: neg t4, a2 +; RV32-NEXT: sub a4, a1, a3 +; RV32-NEXT: or a1, a4, t4 +; RV32-NEXT: sub a3, s1, t3 +; RV32-NEXT: beqz a1, .LBB3_38 +; RV32-NEXT: # %bb.37: # %_udiv-special-cases +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: or a2, a5, t5 +; RV32-NEXT: bnez a0, .LBB3_39 +; RV32-NEXT: j .LBB3_40 +; RV32-NEXT: .LBB3_38: +; RV32-NEXT: snez a1, t2 +; RV32-NEXT: sltiu a2, a3, 129 +; RV32-NEXT: xori a2, a2, 1 +; RV32-NEXT: or a1, a2, a1 +; RV32-NEXT: or a2, a5, t5 +; RV32-NEXT: beqz a0, .LBB3_40 +; RV32-NEXT: .LBB3_39: # %_udiv-special-cases +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: .LBB3_40: # %_udiv-special-cases +; RV32-NEXT: or t6, a2, a1 +; RV32-NEXT: addi a1, t6, -1 +; RV32-NEXT: and a2, s11, a1 +; RV32-NEXT: and a5, a1, t1 +; RV32-NEXT: and t3, a1, a6 +; RV32-NEXT: and t5, a1, t0 +; RV32-NEXT: and a1, a1, a7 +; RV32-NEXT: bnez t6, .LBB3_57 +; RV32-NEXT: # %bb.41: # %_udiv-special-cases +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: xori s0, a3, 128 +; RV32-NEXT: or s0, s0, a0 +; RV32-NEXT: or s0, s0, a4 +; RV32-NEXT: or t6, s0, t6 +; RV32-NEXT: beqz t6, .LBB3_57 +; RV32-NEXT: # %bb.42: # %udiv-bb1 +; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: addi a1, a3, 1 +; RV32-NEXT: sw zero, 136(sp) +; RV32-NEXT: sw zero, 140(sp) +; RV32-NEXT: sw zero, 144(sp) +; RV32-NEXT: sw zero, 148(sp) +; RV32-NEXT: sw zero, 120(sp) +; RV32-NEXT: sw zero, 124(sp) +; RV32-NEXT: sw zero, 128(sp) +; RV32-NEXT: sw zero, 132(sp) +; RV32-NEXT: sw a7, 152(sp) +; RV32-NEXT: sw t0, 156(sp) +; RV32-NEXT: sw a6, 160(sp) +; RV32-NEXT: sw t1, 164(sp) +; RV32-NEXT: sw s11, 168(sp) +; RV32-NEXT: li a5, 128 +; RV32-NEXT: addi t3, sp, 152 +; RV32-NEXT: neg a2, a3 +; RV32-NEXT: seqz t5, a1 +; RV32-NEXT: sub a5, a5, a3 +; RV32-NEXT: add t2, t2, t5 +; RV32-NEXT: andi a3, a5, 31 +; RV32-NEXT: srli t5, a5, 3 +; RV32-NEXT: or t6, a1, t2 +; RV32-NEXT: xori a5, a3, 31 +; RV32-NEXT: andi a3, t5, 28 +; RV32-NEXT: seqz t6, t6 +; RV32-NEXT: sub ra, t3, a3 +; RV32-NEXT: add t6, a4, t6 +; RV32-NEXT: lw t3, 0(ra) +; RV32-NEXT: lw s0, 4(ra) +; RV32-NEXT: lw s1, 8(ra) +; RV32-NEXT: lw a3, 12(ra) +; RV32-NEXT: sltu a4, t6, a4 +; RV32-NEXT: or t5, a1, t6 +; RV32-NEXT: add t4, t4, a4 +; RV32-NEXT: or a4, t2, t4 +; RV32-NEXT: or a4, t5, a4 +; RV32-NEXT: srli t5, s1, 1 +; RV32-NEXT: seqz s2, a4 +; RV32-NEXT: add a0, a0, s2 +; RV32-NEXT: sll s2, a3, a2 +; RV32-NEXT: srl t5, t5, a5 +; RV32-NEXT: or t5, s2, t5 +; RV32-NEXT: srli s2, s0, 1 +; RV32-NEXT: sll s1, s1, a2 +; RV32-NEXT: srl s2, s2, a5 +; RV32-NEXT: or s2, s1, s2 +; RV32-NEXT: srli s1, t3, 1 +; RV32-NEXT: sll s0, s0, a2 +; RV32-NEXT: srl s1, s1, a5 +; RV32-NEXT: andi s3, a0, 1 +; RV32-NEXT: or s1, s0, s1 +; RV32-NEXT: or a0, a4, s3 +; RV32-NEXT: sll t3, t3, a2 +; RV32-NEXT: beqz a0, .LBB3_55 +; RV32-NEXT: # %bb.43: # %udiv-preheader +; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: lw a0, 16(ra) +; RV32-NEXT: sw zero, 104(sp) +; RV32-NEXT: sw zero, 108(sp) +; RV32-NEXT: sw zero, 112(sp) +; RV32-NEXT: sw zero, 116(sp) +; RV32-NEXT: sw zero, 88(sp) +; RV32-NEXT: sw zero, 92(sp) +; RV32-NEXT: sw zero, 96(sp) +; RV32-NEXT: sw zero, 100(sp) +; RV32-NEXT: sw s11, 72(sp) +; RV32-NEXT: sw zero, 76(sp) +; RV32-NEXT: sw zero, 80(sp) +; RV32-NEXT: sw zero, 84(sp) +; RV32-NEXT: sw a7, 56(sp) +; RV32-NEXT: sw t0, 60(sp) +; RV32-NEXT: sw a6, 64(sp) +; RV32-NEXT: sw t1, 68(sp) +; RV32-NEXT: srli a4, a1, 3 +; RV32-NEXT: addi a6, sp, 56 +; RV32-NEXT: andi a7, a1, 31 +; RV32-NEXT: or t0, s9, s10 +; RV32-NEXT: srl a3, a3, a5 +; RV32-NEXT: andi a4, a4, 28 +; RV32-NEXT: xori a5, a7, 31 +; RV32-NEXT: snez a7, t0 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: add a7, s8, a7 +; RV32-NEXT: lw a6, 16(a4) +; RV32-NEXT: lw t0, 0(a4) +; RV32-NEXT: lw t1, 4(a4) +; RV32-NEXT: lw s0, 8(a4) +; RV32-NEXT: lw a4, 12(a4) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: slli a0, a4, 1 +; RV32-NEXT: slli a2, s0, 1 +; RV32-NEXT: slli s4, t1, 1 +; RV32-NEXT: sll a6, a6, a5 +; RV32-NEXT: sll a0, a0, a5 +; RV32-NEXT: sll s8, a2, a5 +; RV32-NEXT: sll s4, s4, a5 +; RV32-NEXT: srl a2, a4, a1 +; RV32-NEXT: or ra, a2, a6 +; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: seqz a4, a6 +; RV32-NEXT: srl a2, s0, a1 +; RV32-NEXT: or a2, a2, a0 +; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: or a0, a6, a5 +; RV32-NEXT: sub s5, a5, a4 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: srl a0, t1, a1 +; RV32-NEXT: or a0, a0, s8 +; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: sub t1, a5, a4 +; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sltu a4, a5, a4 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: sub s6, a5, a4 +; RV32-NEXT: andi a4, a7, 1 +; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: andi a5, a3, 1 +; RV32-NEXT: srl a3, t0, a1 +; RV32-NEXT: or a4, a3, s4 +; RV32-NEXT: addi a6, a6, -1 +; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li s11, 0 +; RV32-NEXT: li s10, 0 +; RV32-NEXT: j .LBB3_45 +; RV32-NEXT: .LBB3_44: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: and s0, a5, s0 +; RV32-NEXT: xor s8, t1, a7 +; RV32-NEXT: xor s9, a2, s0 +; RV32-NEXT: or s8, s9, s8 +; RV32-NEXT: li s9, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: sltu s4, a2, s0 +; RV32-NEXT: sub s0, a2, s0 +; RV32-NEXT: sub a7, t1, a7 +; RV32-NEXT: srli a2, s2, 31 +; RV32-NEXT: sub a0, a0, t0 +; RV32-NEXT: slli t0, t5, 1 +; RV32-NEXT: or t0, t0, a2 +; RV32-NEXT: srli a2, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or t1, s2, a2 +; RV32-NEXT: srli a2, t3, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or s1, s1, a2 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: or t3, a2, t3 +; RV32-NEXT: srli a2, t5, 31 +; RV32-NEXT: or s7, s7, a2 +; RV32-NEXT: sub a2, s0, ra +; RV32-NEXT: sltu s0, s0, ra +; RV32-NEXT: or t5, a1, t6 +; RV32-NEXT: sub a7, a7, s4 +; RV32-NEXT: or s2, t2, t4 +; RV32-NEXT: sub a0, a0, a6 +; RV32-NEXT: or a6, a1, t2 +; RV32-NEXT: or s4, t5, s2 +; RV32-NEXT: seqz t5, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: seqz a6, a6 +; RV32-NEXT: sub t2, t2, t5 +; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: or s1, a5, s1 +; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: or s2, a5, t1 +; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: or t5, a5, t0 +; RV32-NEXT: andi a5, s7, 1 +; RV32-NEXT: sub ra, a7, s0 +; RV32-NEXT: snez a7, s4 +; RV32-NEXT: sltu t0, t6, a6 +; RV32-NEXT: sub t6, t6, a6 +; RV32-NEXT: add a7, s3, a7 +; RV32-NEXT: sub t4, t4, t0 +; RV32-NEXT: or a6, a1, t6 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: or t0, t2, t4 +; RV32-NEXT: andi s3, a7, 1 +; RV32-NEXT: or a6, a6, t0 +; RV32-NEXT: or a6, a6, s3 +; RV32-NEXT: sub a4, a4, a3 +; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: beqz a6, .LBB3_56 +; RV32-NEXT: .LBB3_45: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli a3, a2, 31 +; RV32-NEXT: slli a6, ra, 1 +; RV32-NEXT: or t1, a6, a3 +; RV32-NEXT: srli a3, a0, 31 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: beq s6, t1, .LBB3_47 +; RV32-NEXT: # %bb.46: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu a3, s6, t1 +; RV32-NEXT: j .LBB3_48 +; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: sltu a3, a3, a2 +; RV32-NEXT: .LBB3_48: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: srli a6, a4, 31 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: slli a4, a4, 1 +; RV32-NEXT: or a0, a0, a6 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: or a4, a4, a5 +; RV32-NEXT: beq s5, a0, .LBB3_50 +; RV32-NEXT: # %bb.49: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu a5, s5, a0 +; RV32-NEXT: j .LBB3_51 +; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: sltu a5, a5, a4 +; RV32-NEXT: .LBB3_51: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: xor a6, a6, a2 +; RV32-NEXT: xor a7, s6, t1 +; RV32-NEXT: or a6, a6, a7 +; RV32-NEXT: beqz a6, .LBB3_53 +; RV32-NEXT: # %bb.52: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: .LBB3_53: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: srli a3, ra, 31 +; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: sub a3, a6, a3 +; RV32-NEXT: sub a3, a3, a5 +; RV32-NEXT: slli a3, a3, 31 +; RV32-NEXT: srai a5, a3, 31 +; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: and a7, a5, a3 +; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: and t0, a5, a6 +; RV32-NEXT: sltu a6, a4, a3 +; RV32-NEXT: mv ra, a6 +; RV32-NEXT: beq a0, t0, .LBB3_44 +; RV32-NEXT: # %bb.54: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu ra, a0, t0 +; RV32-NEXT: j .LBB3_44 +; RV32-NEXT: .LBB3_55: +; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: li s11, 0 +; RV32-NEXT: li s9, 0 +; RV32-NEXT: li s10, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: .LBB3_56: # %udiv-loop-exit +; RV32-NEXT: srli a0, s2, 31 +; RV32-NEXT: slli a1, t5, 1 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: srli a1, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a2, s2, a1 +; RV32-NEXT: srli a3, t3, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: srli a4, t5, 31 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: or a1, a1, t3 +; RV32-NEXT: or a3, s11, a3 +; RV32-NEXT: or a4, s8, a4 +; RV32-NEXT: or t5, a3, s1 +; RV32-NEXT: or t3, s9, a2 +; RV32-NEXT: or a5, s10, a0 +; RV32-NEXT: andi a2, a4, 1 +; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: .LBB3_57: # %udiv-end +; RV32-NEXT: sw a1, 0(ra) +; RV32-NEXT: sw t5, 4(ra) +; RV32-NEXT: sw t3, 8(ra) +; RV32-NEXT: sw a5, 12(ra) +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: sb a2, 16(ra) +; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 240 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i129: +; RV64: # %bb.0: # %_udiv-special-cases +; RV64-NEXT: ld a3, 0(a2) +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld t1, 16(a2) +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a5, 209715 +; RV64-NEXT: lui a6, 61681 +; RV64-NEXT: addi t0, a2, 1365 +; RV64-NEXT: addi a7, a5, 819 +; RV64-NEXT: addi a6, a6, -241 +; RV64-NEXT: slli a2, t0, 32 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: slli t2, a6, 32 +; RV64-NEXT: add t0, t0, a2 +; RV64-NEXT: add a7, a7, a5 +; RV64-NEXT: add a6, a6, t2 +; RV64-NEXT: srli a2, a4, 1 +; RV64-NEXT: slli a5, t1, 63 +; RV64-NEXT: slli t2, a4, 63 +; RV64-NEXT: or t3, a5, a2 +; RV64-NEXT: srli a2, a3, 1 +; RV64-NEXT: or t4, a2, t2 +; RV64-NEXT: bnez t3, .LBB3_2 +; RV64-NEXT: # %bb.1: # %_udiv-special-cases +; RV64-NEXT: srli a2, t4, 1 +; RV64-NEXT: or a2, t4, a2 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli a2, a2, 56 +; RV64-NEXT: addi t2, a2, 64 +; RV64-NEXT: j .LBB3_3 +; RV64-NEXT: .LBB3_2: +; RV64-NEXT: srli a2, t3, 1 +; RV64-NEXT: or a2, t3, a2 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli t2, a2, 56 +; RV64-NEXT: .LBB3_3: # %_udiv-special-cases +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill +; RV64-NEXT: slli a2, a3, 63 +; RV64-NEXT: li t5, 128 +; RV64-NEXT: bnez a2, .LBB3_5 +; RV64-NEXT: # %bb.4: # %_udiv-special-cases +; RV64-NEXT: li s0, 128 +; RV64-NEXT: j .LBB3_6 +; RV64-NEXT: .LBB3_5: +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli s0, a2, 56 +; RV64-NEXT: .LBB3_6: # %_udiv-special-cases +; RV64-NEXT: ld a5, 0(a1) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: ld s2, 16(a1) +; RV64-NEXT: or a1, t4, t3 +; RV64-NEXT: addi s1, s0, 128 +; RV64-NEXT: bnez a1, .LBB3_8 +; RV64-NEXT: # %bb.7: # %_udiv-special-cases +; RV64-NEXT: mv t2, s1 +; RV64-NEXT: .LBB3_8: # %_udiv-special-cases +; RV64-NEXT: snez s3, a1 +; RV64-NEXT: srli a1, a2, 1 +; RV64-NEXT: slli t3, s2, 63 +; RV64-NEXT: slli t4, a2, 63 +; RV64-NEXT: or a1, t3, a1 +; RV64-NEXT: srli t3, a5, 1 +; RV64-NEXT: or t6, t3, t4 +; RV64-NEXT: bnez a1, .LBB3_10 +; RV64-NEXT: # %bb.9: # %_udiv-special-cases +; RV64-NEXT: srli t3, t6, 1 +; RV64-NEXT: or t3, t6, t3 +; RV64-NEXT: srli t4, t3, 2 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 8 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 16 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 32 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: not t3, t3 +; RV64-NEXT: srli t4, t3, 1 +; RV64-NEXT: and t4, t4, t0 +; RV64-NEXT: sub t3, t3, t4 +; RV64-NEXT: and t4, t3, a7 +; RV64-NEXT: srli t3, t3, 2 +; RV64-NEXT: and t3, t3, a7 +; RV64-NEXT: add t3, t4, t3 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: and t3, t3, a6 +; RV64-NEXT: slli t4, t3, 8 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 16 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 32 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: srli t3, t3, 56 +; RV64-NEXT: addi s4, t3, 64 +; RV64-NEXT: j .LBB3_11 +; RV64-NEXT: .LBB3_10: +; RV64-NEXT: srli t3, a1, 1 +; RV64-NEXT: or t3, a1, t3 +; RV64-NEXT: srli t4, t3, 2 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 8 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 16 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 32 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: not t3, t3 +; RV64-NEXT: srli t4, t3, 1 +; RV64-NEXT: and t4, t4, t0 +; RV64-NEXT: sub t3, t3, t4 +; RV64-NEXT: and t4, t3, a7 +; RV64-NEXT: srli t3, t3, 2 +; RV64-NEXT: and t3, t3, a7 +; RV64-NEXT: add t3, t4, t3 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: and t3, t3, a6 +; RV64-NEXT: slli t4, t3, 8 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 16 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 32 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: srli s4, t3, 56 +; RV64-NEXT: .LBB3_11: # %_udiv-special-cases +; RV64-NEXT: andi t4, s2, 1 +; RV64-NEXT: andi t1, t1, 1 +; RV64-NEXT: or t3, a3, a4 +; RV64-NEXT: or s2, a5, a2 +; RV64-NEXT: sltu s0, s1, s0 +; RV64-NEXT: slli s1, a5, 63 +; RV64-NEXT: addi s3, s3, -1 +; RV64-NEXT: beqz s1, .LBB3_13 +; RV64-NEXT: # %bb.12: +; RV64-NEXT: srli t5, s1, 1 +; RV64-NEXT: or t5, s1, t5 +; RV64-NEXT: srli s1, t5, 2 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 4 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 8 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 16 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 32 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: not t5, t5 +; RV64-NEXT: srli s1, t5, 1 +; RV64-NEXT: and t0, s1, t0 +; RV64-NEXT: sub t0, t5, t0 +; RV64-NEXT: and t5, t0, a7 +; RV64-NEXT: srli t0, t0, 2 +; RV64-NEXT: and a7, t0, a7 +; RV64-NEXT: add a7, t5, a7 +; RV64-NEXT: srli t0, a7, 4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: and a6, a7, a6 +; RV64-NEXT: slli a7, a6, 8 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: slli a7, a6, 16 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: slli a7, a6, 32 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: srli t5, a6, 56 +; RV64-NEXT: .LBB3_13: # %_udiv-special-cases +; RV64-NEXT: or t0, t3, t1 +; RV64-NEXT: or a6, s2, t4 +; RV64-NEXT: and a7, s3, s0 +; RV64-NEXT: or t6, t6, a1 +; RV64-NEXT: addi s0, t5, 128 +; RV64-NEXT: bnez t6, .LBB3_15 +; RV64-NEXT: # %bb.14: # %_udiv-special-cases +; RV64-NEXT: mv s4, s0 +; RV64-NEXT: .LBB3_15: # %_udiv-special-cases +; RV64-NEXT: seqz a1, t0 +; RV64-NEXT: sltu t0, s0, t5 +; RV64-NEXT: snez t5, t6 +; RV64-NEXT: addi t5, t5, -1 +; RV64-NEXT: and t0, t5, t0 +; RV64-NEXT: sltu t5, t2, s4 +; RV64-NEXT: seqz a6, a6 +; RV64-NEXT: mv t6, t5 +; RV64-NEXT: beq a7, t0, .LBB3_17 +; RV64-NEXT: # %bb.16: # %_udiv-special-cases +; RV64-NEXT: sltu t6, a7, t0 +; RV64-NEXT: .LBB3_17: # %_udiv-special-cases +; RV64-NEXT: or a1, a1, a6 +; RV64-NEXT: andi a6, t6, 1 +; RV64-NEXT: sub a7, a7, t0 +; RV64-NEXT: sub t5, a7, t5 +; RV64-NEXT: sub a7, t2, s4 +; RV64-NEXT: beqz a6, .LBB3_19 +; RV64-NEXT: # %bb.18: # %_udiv-special-cases +; RV64-NEXT: mv t0, a6 +; RV64-NEXT: j .LBB3_20 +; RV64-NEXT: .LBB3_19: +; RV64-NEXT: sltiu t0, a7, 129 +; RV64-NEXT: xori t0, t0, 1 +; RV64-NEXT: snez t2, t5 +; RV64-NEXT: or t0, t0, t2 +; RV64-NEXT: .LBB3_20: # %_udiv-special-cases +; RV64-NEXT: or t6, a1, t0 +; RV64-NEXT: addi a1, t6, -1 +; RV64-NEXT: and t2, t4, a1 +; RV64-NEXT: and t0, a1, a2 +; RV64-NEXT: and a1, a1, a5 +; RV64-NEXT: bnez t6, .LBB3_30 +; RV64-NEXT: # %bb.21: # %_udiv-special-cases +; RV64-NEXT: xori t6, a7, 128 +; RV64-NEXT: or t6, t6, a6 +; RV64-NEXT: or t6, t6, t5 +; RV64-NEXT: beqz t6, .LBB3_30 +; RV64-NEXT: # %bb.22: # %udiv-bb1 +; RV64-NEXT: addi a1, a7, 1 +; RV64-NEXT: sd zero, 64(sp) +; RV64-NEXT: sd zero, 72(sp) +; RV64-NEXT: sd zero, 80(sp) +; RV64-NEXT: sd zero, 88(sp) +; RV64-NEXT: sd a5, 96(sp) +; RV64-NEXT: sd a2, 104(sp) +; RV64-NEXT: sd t4, 112(sp) +; RV64-NEXT: li t0, 128 +; RV64-NEXT: addi t2, sp, 96 +; RV64-NEXT: neg s1, a7 +; RV64-NEXT: seqz t6, a1 +; RV64-NEXT: sub a7, t0, a7 +; RV64-NEXT: add t5, t5, t6 +; RV64-NEXT: andi t0, a7, 63 +; RV64-NEXT: srli a7, a7, 3 +; RV64-NEXT: or t6, a1, t5 +; RV64-NEXT: xori s2, t0, 63 +; RV64-NEXT: andi a7, a7, 24 +; RV64-NEXT: seqz t0, t6 +; RV64-NEXT: sub s3, t2, a7 +; RV64-NEXT: add a6, a6, t0 +; RV64-NEXT: ld t2, 0(s3) +; RV64-NEXT: ld s4, 8(s3) +; RV64-NEXT: andi a7, a6, 1 +; RV64-NEXT: or t6, t6, a7 +; RV64-NEXT: srli a6, t2, 1 +; RV64-NEXT: sll t0, s4, s1 +; RV64-NEXT: srl a6, a6, s2 +; RV64-NEXT: or t0, t0, a6 +; RV64-NEXT: sll a6, t2, s1 +; RV64-NEXT: li t2, 0 +; RV64-NEXT: beqz t6, .LBB3_28 +; RV64-NEXT: # %bb.23: # %udiv-preheader +; RV64-NEXT: li t6, 0 +; RV64-NEXT: li s0, 0 +; RV64-NEXT: srli s4, s4, 1 +; RV64-NEXT: ld s3, 16(s3) +; RV64-NEXT: sd zero, 32(sp) +; RV64-NEXT: sd zero, 40(sp) +; RV64-NEXT: sd zero, 48(sp) +; RV64-NEXT: sd zero, 56(sp) +; RV64-NEXT: sd a5, 0(sp) +; RV64-NEXT: sd a2, 8(sp) +; RV64-NEXT: sd t4, 16(sp) +; RV64-NEXT: sd zero, 24(sp) +; RV64-NEXT: srli a2, a1, 3 +; RV64-NEXT: srl a5, s4, s2 +; RV64-NEXT: mv t4, sp +; RV64-NEXT: snez t3, t3 +; RV64-NEXT: andi a2, a2, 24 +; RV64-NEXT: add t1, t1, t3 +; RV64-NEXT: add a2, t4, a2 +; RV64-NEXT: ld t3, 0(a2) +; RV64-NEXT: ld t4, 8(a2) +; RV64-NEXT: ld a2, 16(a2) +; RV64-NEXT: sll s1, s3, s1 +; RV64-NEXT: andi s2, a1, 63 +; RV64-NEXT: xori s2, s2, 63 +; RV64-NEXT: or s3, s1, a5 +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: slli a5, t4, 1 +; RV64-NEXT: sll a2, a2, s2 +; RV64-NEXT: sll s2, a5, s2 +; RV64-NEXT: srl s1, t4, a1 +; RV64-NEXT: or s1, s1, a2 +; RV64-NEXT: seqz a2, a3 +; RV64-NEXT: sub a2, a4, a2 +; RV64-NEXT: addi a5, t1, 1 +; RV64-NEXT: andi a5, a5, 1 +; RV64-NEXT: andi s3, s3, 1 +; RV64-NEXT: srl t1, t3, a1 +; RV64-NEXT: or s2, t1, s2 +; RV64-NEXT: addi t1, a3, -1 +; RV64-NEXT: j .LBB3_26 +; RV64-NEXT: .LBB3_24: # %udiv-do-while +; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: sltu t3, a2, s4 +; RV64-NEXT: .LBB3_25: # %udiv-do-while +; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: srli s1, s1, 63 +; RV64-NEXT: sub t4, a5, s1 +; RV64-NEXT: sub t3, t4, t3 +; RV64-NEXT: slli t3, t3, 63 +; RV64-NEXT: srai s1, t3, 63 +; RV64-NEXT: and s3, s1, a4 +; RV64-NEXT: li t3, 0 +; RV64-NEXT: li t4, 0 +; RV64-NEXT: srli s5, a6, 63 +; RV64-NEXT: sub s4, s4, s3 +; RV64-NEXT: slli s3, t0, 1 +; RV64-NEXT: or s3, s3, s5 +; RV64-NEXT: srli t0, t0, 63 +; RV64-NEXT: slli a6, a6, 1 +; RV64-NEXT: or a6, t2, a6 +; RV64-NEXT: seqz t2, a1 +; RV64-NEXT: or s0, s0, t0 +; RV64-NEXT: or s5, a1, t5 +; RV64-NEXT: sub t5, t5, t2 +; RV64-NEXT: and s6, s1, a3 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: andi t2, s1, 1 +; RV64-NEXT: or t0, t6, s3 +; RV64-NEXT: sltu t6, s2, s6 +; RV64-NEXT: snez s5, s5 +; RV64-NEXT: andi s3, s0, 1 +; RV64-NEXT: sub s1, s4, t6 +; RV64-NEXT: add a7, a7, s5 +; RV64-NEXT: addi a7, a7, 1 +; RV64-NEXT: andi a7, a7, 1 +; RV64-NEXT: or t6, a1, t5 +; RV64-NEXT: or s4, t6, a7 +; RV64-NEXT: sub s2, s2, s6 +; RV64-NEXT: li t6, 0 +; RV64-NEXT: li s0, 0 +; RV64-NEXT: beqz s4, .LBB3_29 +; RV64-NEXT: .LBB3_26: # %udiv-do-while +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: srli t3, s2, 63 +; RV64-NEXT: slli t4, s1, 1 +; RV64-NEXT: slli s2, s2, 1 +; RV64-NEXT: or s4, t4, t3 +; RV64-NEXT: andi t3, s3, 1 +; RV64-NEXT: or s2, s2, t3 +; RV64-NEXT: bne a2, s4, .LBB3_24 +; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: sltu t3, t1, s2 +; RV64-NEXT: j .LBB3_25 +; RV64-NEXT: .LBB3_28: +; RV64-NEXT: li t3, 0 +; RV64-NEXT: li t4, 0 +; RV64-NEXT: .LBB3_29: # %udiv-loop-exit +; RV64-NEXT: srli a2, a6, 63 +; RV64-NEXT: slli a3, t0, 1 +; RV64-NEXT: srli a4, t0, 63 +; RV64-NEXT: slli a6, a6, 1 +; RV64-NEXT: or a1, t2, a6 +; RV64-NEXT: or a2, t3, a2 +; RV64-NEXT: or a4, t4, a4 +; RV64-NEXT: or t0, a2, a3 +; RV64-NEXT: andi t2, a4, 1 +; RV64-NEXT: .LBB3_30: # %udiv-end +; RV64-NEXT: andi a2, t2, 1 +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: sd t0, 8(a0) +; RV64-NEXT: sb a2, 16(a0) +; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 +; RV64-NEXT: ret %res = udiv i129 %x, %y ret i129 %res } diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll index 04a2268..314e1b4 100644 --- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll +++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll @@ -1,5 +1,6 @@ ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH +; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH target triple = "wasm32" @@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: i32_mac_u8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.mul +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add + +; RELAXED-MAX-BANDWIDTH: loop +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %mul = mul nsw i32 %conv2, %conv + %add = add nsw i32 %mul, %res.08 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: i32_mac_s16: ; CHECK: i32x4.load16x4_s 0:p2align=1 @@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.dot_i16x8_s +; MAX-BANDWIDTH: i32x4.add + +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.add entry: %cmp7.not = icmp eq i32 %N, 0 @@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp6.not = icmp eq i32 %N, 0 br i1 %cmp6.not, label %for.cond.cleanup, label %for.body @@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add + + entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp6.not = icmp eq i32 %N, 0 br i1 %cmp6.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll new file mode 100644 index 0000000..9716cbe --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s + +target triple = "wasm32" +; relaxed_dot stands for relaxed_dot_i8x16_i7x16_s, as in td +; relaxed_dot_add stands for i32x4.relaxed_dot_i8x16_i7x16_add_s, as in td + +define <8 x i16> @relaxed_dot_sext_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_sext_1: +; CHECK: .functype relaxed_dot_sext_1 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1 +; CHECK-NEXT: return $pop0 + %sext1 = sext <16 x i8> %a to <16 x i16> + %sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} + + +define <8 x i16> @relaxed_dot_sext_2(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_sext_2: +; CHECK: .functype relaxed_dot_sext_2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1 +; CHECK-NEXT: return $pop0 + %sext1 = sext <16 x i8> %a to <16 x i16> + %sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle2, %shuffle1 + ret <8 x i16> %res +} + +define <8 x i16> @relaxed_dot_sext_self(<16 x i8> %v) { +; CHECK-LABEL: relaxed_dot_sext_self: +; CHECK: .functype relaxed_dot_sext_self (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $0 +; CHECK-NEXT: return $pop0 + %sext = sext <16 x i8> %v to <16 x i16> + %mul = mul <16 x i16> %sext, %sext + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} + +define <4 x i32> @relaxed_dot_add_from_relaxed_dot(<16 x i8> %a, <16 x i8> %b, <4 x i32> %c) { +; CHECK-LABEL: relaxed_dot_add_from_relaxed_dot: +; CHECK: .functype relaxed_dot_add_from_relaxed_dot (v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s $push0=, $0, $1, $2 +; CHECK-NEXT: return $pop0 + %relaxed_dot_call = call <8 x i16> @llvm.wasm.relaxed.dot.i8x16.i7x16.signed(<16 x i8> %a, <16 x i8> %b) + %sext = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %relaxed_dot_call) + %res = add <4 x i32> %sext, %c + ret <4 x i32> %res +} + +; INFO: Negative test +define <8 x i16> @relaxed_dot_zext(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_zext: +; CHECK: .functype relaxed_dot_zext (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push6=, $0, $1 +; CHECK-NEXT: local.tee $push5=, $2=, $pop6 +; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1 +; CHECK-NEXT: local.tee $push3=, $1=, $pop4 +; CHECK-NEXT: i8x16.shuffle $push1=, $pop5, $pop3, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: i8x16.shuffle $push0=, $2, $1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 +; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %zext1 = zext <16 x i8> %a to <16 x i16> + %zext2 = zext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %zext1, %zext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res + +} + +; INFO: Negative test +define <8 x i16> @relaxed_dot_wrong_shuffle(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_wrong_shuffle: +; CHECK: .functype relaxed_dot_wrong_shuffle (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push1=, $0, $1 +; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push0=, $0, $1 +; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %sext1 = sext <16 x i8> %a to <16 x i16> + %sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 5571519..c90344b8 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -502,11 +502,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0] ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,16,0,8,8,0,0,0,0,0,2,0,2,0,0,0] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -517,7 +517,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; SSE41-NEXT: psubw %xmm1, %xmm0 -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0] ; SSE41-NEXT: paddw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 @@ -530,7 +530,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7] @@ -541,7 +541,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0] ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq @@ -630,7 +630,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0] ; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 @@ -641,7 +641,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [171,0,0,0] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: psrlw $7, %xmm2 @@ -654,7 +654,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; AVX-LABEL: combine_vec_udiv_nonuniform4: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [171,0,0,0] ; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 @@ -691,7 +691,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-NEXT: psubw %xmm3, %xmm0 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768] ; SSE2-NEXT: paddw %xmm3, %xmm0 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,u,0,4,0,4,16,0,4,0,0,4,0,0,0,16] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll b/llvm/test/CodeGen/X86/cpus-intel.ll index 71253c8..646629d 100644 --- a/llvm/test/CodeGen/X86/cpus-intel.ll +++ b/llvm/test/CodeGen/X86/cpus-intel.ll @@ -39,6 +39,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty @@ -106,6 +107,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll index c2b7068..df04b67 100644 --- a/llvm/test/CodeGen/X86/isel-fpclass.ll +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86,X86-SDAGISEL +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL ; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL ; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL -; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86,X86-GISEL -; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X64-GISEL +; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL define i1 @isnone_f(float %x) nounwind { ; X86-LABEL: isnone_f: @@ -23,11 +23,6 @@ define i1 @isnone_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isnone_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %eax, %eax -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) ret i1 %0 @@ -50,27 +45,22 @@ define i1 @isany_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isany_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movb $1, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) ret i1 %0 } define i1 @issignaling_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: issignaling_f: -; X86-SDAGISEL: # %bb.0: -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %cl -; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: andb %cl, %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: issignaling_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl ; ; X64-LABEL: issignaling_f: ; X64: # %bb.0: @@ -97,44 +87,18 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: andb %cl, %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: issignaling_f: -; X86-GISEL: # %bb.0: -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %dl -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: andb %dl, %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: issignaling_f: -; X64-GISEL: # %bb.0: -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %dl -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: andb %dl, %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %a0 } define i1 @isquiet_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: isquiet_f: ; X64: # %bb.0: # %entry @@ -155,39 +119,19 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isquiet_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setae %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isquiet_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setae %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 } define i1 @not_isquiet_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: not_isquiet_f: ; X64: # %bb.0: # %entry @@ -208,57 +152,19 @@ define i1 @not_isquiet_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isquiet_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %cl -; X86-GISEL-NEXT: orb %dl, %cl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %dl -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: andb %dl, %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isquiet_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %cl -; X64-GISEL-NEXT: orb %dl, %cl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %dl -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: andb %dl, %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 } define i1 @isinf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: isinf_f: ; X64: # %bb.0: # %entry @@ -279,39 +185,19 @@ define i1 @isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isinf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isinf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 } define i1 @not_isinf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_isinf_f: ; X64: # %bb.0: # %entry @@ -332,43 +218,17 @@ define i1 @not_isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isinf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %dl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isinf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 } define i1 @is_plus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_plus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_plus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_inf_f: ; X64: # %bb.0: # %entry @@ -386,34 +246,17 @@ define i1 @is_plus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_plus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_plus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 } define i1 @is_minus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -431,34 +274,17 @@ define i1 @is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_minus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_minus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 } define i1 @not_is_minus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -476,55 +302,19 @@ define i1 @not_is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_is_minus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: pushl %ebx -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: movl %eax, %ecx -; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %edx, %edx -; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %bl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %ah -; X86-GISEL-NEXT: orb %dl, %ah -; X86-GISEL-NEXT: orb %bl, %ah -; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %ah, %al -; X86-GISEL-NEXT: popl %ebx -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_is_minus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: movl %eax, %ecx -; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %edx, %edx -; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %sil -; X64-GISEL-NEXT: orb %dl, %sil -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %dl -; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: orb %sil, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 } define i1 @isfinite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: isfinite_f: ; X64: # %bb.0: # %entry @@ -545,39 +335,19 @@ define i1 @isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isfinite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isfinite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } define i1 @not_isfinite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: not_isfinite_f: ; X64: # %bb.0: # %entry @@ -598,43 +368,17 @@ define i1 @not_isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isfinite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %dl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isfinite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 } define i1 @is_plus_finite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_plus_finite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setb %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_plus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setb %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_finite_f: ; X64: # %bb.0: # %entry @@ -652,23 +396,6 @@ define i1 @is_plus_finite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setb %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_plus_finite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_plus_finite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" ret i1 %0 @@ -691,11 +418,6 @@ define i1 @isnone_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isnone_d: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %eax, %eax -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) ret i1 %0 @@ -718,11 +440,6 @@ define i1 @isany_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isany_d: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movb $1, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) ret i1 %0 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index bdb7c30..4ec54d8 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2071,7 +2071,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32] ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index d752659..04f0a65 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-NOVBMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI ; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 @@ -883,6 +883,30 @@ define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal } define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: mul256: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-NOVBMI-NEXT: vzeroupper +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: mul256: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0 @@ -960,6 +984,21 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" } define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" { +; CHECK-SKX-NOVBMI-LABEL: mul512: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-SKX-NOVBMI-NEXT: vzeroupper +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: mul512: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 @@ -1137,6 +1176,14 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector } define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 @@ -1192,6 +1239,14 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w } define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_sign: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index cc4bda8..650b562 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 7c1a1e2..874d885 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 6174011..83a0ddb 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI @@ -598,6 +598,33 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; +; AVX512F-LABEL: var_shuffle_zero_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] +; AVX512F-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] +; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; ; AVX512VL-LABEL: var_shuffle_zero_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 @@ -921,6 +948,28 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; +; AVX512F-LABEL: var_shuffle_zero_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; ; AVX512VL-LABEL: var_shuffle_zero_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 9b52857..d16b28a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1872,7 +1872,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] @@ -1964,7 +1964,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 983ae59..3d85d55 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -851,7 +851,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddw %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index d565ef0..1602cde 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1673,7 +1673,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] @@ -1750,7 +1750,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 8cb2c7b..a847da6 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1223,7 +1223,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1275,7 +1275,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 57874c4..eb39b6a 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1480,7 +1480,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1532,7 +1532,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> diff --git a/llvm/test/DebugInfo/AArch64/callsite.mir b/llvm/test/DebugInfo/AArch64/callsite.mir new file mode 100644 index 0000000..e3bd764 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/callsite.mir @@ -0,0 +1,68 @@ +# This test should not crash when generating call-site information. +# It was created to make sure that if isCopyLikeInstr in TargetInstrInfo.h +# returns an undef Dest Reg or Src Reg, we don't try to get a SubReg for it. + +# RUN: llc --mtriple=arm64e-apple-ios -start-before=aarch64-asm-printer %s -filetype=obj -o /dev/null --emit-call-site-info +--- | + %struct.rtyuio = type { i8 } + define noundef i32 @aserty(ptr noundef %0, ptr noundef %1) local_unnamed_addr #0 !dbg !23 { + ret i32 0 + } + define void @asdfgh(ptr noundef %0, ptr noundef %1, i8 noundef zeroext %2) local_unnamed_addr #0 !dbg !53 { + %4 = alloca ptr + %5 = call ptr @llvm.stackguard() + %6 = alloca %struct.rtyuio + %7 = icmp eq ptr %1, null + br i1 %7, label %10, label %8 + %9 = tail call i8 @polkiokl(ptr noundef %0) #6 + br label %10 + ret void + } + declare i8 @polkiokl(ptr noundef) local_unnamed_addr #2 + !llvm.module.flags = !{!2, !8} + !llvm.dbg.cu = !{!9} + !2 = !{i32 2, !"Debug Info Version", i32 3} + !8 = !{i32 7, !"frame-pointer", i32 1} + !9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !10, emissionKind: FullDebug, sysroot: "/") + !10 = !DIFile(filename: "a.cpp", directory: "/") + !23 = distinct !DISubprogram(type: !27, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, unit: !9, retainedNodes: !46) + !24 = distinct !DICompositeType(tag: DW_TAG_class_type, identifier: "yshscbshhdvcm") + !27 = !DISubroutineType(types: !28) + !28 = !{} + !30 = !DIDerivedType(tag: DW_TAG_typedef, baseType: !33) + !33 = distinct !DICompositeType(tag: DW_TAG_structure_type, identifier: "tyruwyeuiwiybabd") + !36 = !DISubroutineType(types: !37) + !37 = !{} + !46 = !{} + !47 = !DILocalVariable(scope: !23, type: !48, flags: DIFlagArtificial | DIFlagObjectPointer) + !48 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !24, size: 64) + !49 = !DILocalVariable(scope: !23, type: !30) + !50 = !DILocation(scope: !23) + !51 = !DILocation(scope: !23) + !53 = distinct !DISubprogram(type: !36, unit: !9, retainedNodes: !54) + !54 = !{} +name: aserty +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: + - { bb: 0, offset: 9, fwdArgRegs: + - { arg: 2, reg: '$w2' } } +body: | + bb.0 (%ir-block.2): + DBG_VALUE $x0, $noreg, !47, !DIExpression(), debug-location !50 + DBG_VALUE $x1, $noreg, !49, !DIExpression(), debug-location !50 + frame-setup PACIBSP implicit-def $lr, implicit killed $lr, implicit $sp + early-clobber $sp = frame-setup STPXpre $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.1), (store (s64) into %stack.0) + $fp = frame-setup ADDXri $sp, 0, 0 + frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + frame-setup CFI_INSTRUCTION offset $w30, -8 + frame-setup CFI_INSTRUCTION offset $w29, -16 + $x2 = ORRXrs $xzr, undef $noreg, 0, implicit $wzr, debug-location !51 + BL @asdfgh, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit killed $w2, implicit-def $sp, debug-location !51 +... +name: asdfgh +body: | + bb.2 (%ir-block.10): diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s index 73653d0..6345b2f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s s_mov_b64 s[2:3], 0x10abcdef12345678 // GFX1250: s_mov_b64 s[2:3], 0x10abcdef12345678 ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] @@ -62,10 +62,8 @@ s_cselect_b64 s[2:3], s[4:5], 0x10abcdef12345678 s_mov_b64 s[2:3], 0xffffffff01234567 // GFX1250: s_mov_b64 s[2:3], 0xffffffff01234567 ; encoding: [0xfe,0x01,0x82,0xbe,0x67,0x45,0x23,0x01,0xff,0xff,0xff,0xff] -// TODO: disasm s_mov_b64 s[2:3], lit64(0x777) -// GFX1250-ASM: s_mov_b64 s[2:3], lit64(0x777) ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00] -// GFX1250-DIS: s_mov_b64 s[2:3], 0x777 ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00] +// GFX1250: s_mov_b64 s[2:3], lit64(0x777) ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00] s_mov_b64 s[2:3], 0x777 // GFX1250: s_mov_b64 s[2:3], 0x777 ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 0d61c1f..39de9a2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s v_mov_b64_e32 v[4:5], v[2:3] // GFX1250: v_mov_b64_e32 v[4:5], v[2:3] ; encoding: [0x02,0x3b,0x08,0x7e] @@ -26,8 +26,10 @@ v_mov_b64 v[4:5], -1 v_mov_b64 v[4:5], 0.5 // GFX1250: v_mov_b64_e32 v[4:5], 0.5 ; encoding: [0xf0,0x3a,0x08,0x7e] +// TODO: Encode as a 32-bit literal unless lit64() is specified. v_mov_b64 v[254:255], 0xaf123456 -// GFX1250: v_mov_b64_e32 v[254:255], 0xaf123456 ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: v_mov_b64_e32 v[254:255], 0xaf123456 ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] v_tanh_f32 v5, v1 // GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s index 02872b0..d9f6934 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s @@ -196,8 +196,9 @@ v_add_nc_u64 v[4:5], -4.0, v[4:5] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_add_nc_u64 v[4:5], 0xaf123456, v[4:5] -// GFX1250: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] -// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// GFX1250-ASM: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU v_add_nc_u64 v[4:5], 0x3f717273, v[4:5] // GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f] @@ -316,8 +317,9 @@ v_sub_nc_u64 v[4:5], -4.0, v[4:5] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5] -// GFX1250: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] -// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// GFX1250-ASM: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5] // GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f] @@ -436,8 +438,9 @@ v_mul_u64 v[4:5], -4.0, v[4:5] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_mul_u64 v[4:5], 0xaf123456, v[4:5] -// GFX1250: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] -// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// GFX1250-ASM: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU v_mul_u64 v[4:5], 0x3f717273, v[4:5] // GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s index ad5771b..0548e9d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250,GFX1250-ASM %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s s_alloc_vgpr 0x1235 // GFX12: s_alloc_vgpr 0x1235 ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00] @@ -860,7 +860,8 @@ s_mov_b64 s[0:1], 0x3f717273 s_mov_b64 s[0:1], 0xaf123456 // GFX1200: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_mov_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_mov_b64 s[0:1], null // GFX12: s_mov_b64 s[0:1], null ; encoding: [0x7c,0x01,0x80,0xbe] @@ -969,7 +970,8 @@ s_cmov_b64 s[0:1], 0x3f717273 s_cmov_b64 s[0:1], 0xaf123456 // GFX1200: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cmov_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_not_b32 s0, s1 // GFX12: s_not_b32 s0, s1 ; encoding: [0x01,0x1e,0x80,0xbe] @@ -1072,7 +1074,8 @@ s_not_b64 s[0:1], 0x3f717273 s_not_b64 s[0:1], 0xaf123456 // GFX1200: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_not_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_wqm_b32 s0, s1 // GFX12: s_wqm_b32 s0, s1 ; encoding: [0x01,0x1c,0x80,0xbe] @@ -1175,7 +1178,8 @@ s_wqm_b64 s[0:1], 0x3f717273 s_wqm_b64 s[0:1], 0xaf123456 // GFX1200: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_wqm_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_brev_b32 s0, s1 // GFX12: s_brev_b32 s0, s1 ; encoding: [0x01,0x04,0x80,0xbe] @@ -1278,7 +1282,8 @@ s_brev_b64 s[0:1], 0x3f717273 s_brev_b64 s[0:1], 0xaf123456 // GFX1200: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_brev_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_bcnt0_i32_b32 s0, s1 // GFX12: s_bcnt0_i32_b32 s0, s1 ; encoding: [0x01,0x16,0x80,0xbe] @@ -1390,7 +1395,8 @@ s_bcnt0_i32_b64 s0, 0x3f717273 s_bcnt0_i32_b64 s0, 0xaf123456 // GFX1200: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_bcnt0_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_bcnt1_i32_b32 s0, s1 // GFX12: s_bcnt1_i32_b32 s0, s1 ; encoding: [0x01,0x18,0x80,0xbe] @@ -1502,7 +1508,8 @@ s_bcnt1_i32_b64 s0, 0x3f717273 s_bcnt1_i32_b64 s0, 0xaf123456 // GFX1200: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_bcnt1_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_ff1_i32_b32 s0, s1 // GFX12: s_ctz_i32_b32 s0, s1 ; encoding: [0x01,0x08,0x80,0xbe] @@ -1614,7 +1621,8 @@ s_ff1_i32_b64 s0, 0x3f717273 s_ff1_i32_b64 s0, 0xaf123456 // GFX1200: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_ctz_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_flbit_i32_b32 s0, s1 // GFX12: s_clz_i32_u32 s0, s1 ; encoding: [0x01,0x0a,0x80,0xbe] @@ -1726,7 +1734,8 @@ s_flbit_i32_b64 s0, 0x3f717273 s_flbit_i32_b64 s0, 0xaf123456 // GFX1200: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xff,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_clz_i32_u64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_flbit_i32 s0, s1 // GFX12: s_cls_i32 s0, s1 ; encoding: [0x01,0x0c,0x80,0xbe] @@ -1838,7 +1847,8 @@ s_flbit_i32_i64 s0, 0x3f717273 s_flbit_i32_i64 s0, 0xaf123456 // GFX1200: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xff,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cls_i32_i64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_sext_i32_i8 s0, s1 // GFX12: s_sext_i32_i8 s0, s1 ; encoding: [0x01,0x0e,0x80,0xbe] @@ -2284,7 +2294,8 @@ s_and_saveexec_b64 s[0:1], 0x3f717273 s_and_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_saveexec_b64 s[0:1], s[2:3] // GFX12: s_or_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x23,0x80,0xbe] @@ -2324,7 +2335,8 @@ s_or_saveexec_b64 s[0:1], 0x3f717273 s_or_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xor_saveexec_b64 s[0:1], s[2:3] // GFX12: s_xor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x25,0x80,0xbe] @@ -2364,7 +2376,8 @@ s_xor_saveexec_b64 s[0:1], 0x3f717273 s_xor_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_andn2_saveexec_b64 s[0:1], s[2:3] // GFX12: s_and_not1_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x31,0x80,0xbe] @@ -2404,7 +2417,8 @@ s_andn2_saveexec_b64 s[0:1], 0x3f717273 s_andn2_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_orn2_saveexec_b64 s[0:1], s[2:3] // GFX12: s_or_not1_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x33,0x80,0xbe] @@ -2444,7 +2458,8 @@ s_orn2_saveexec_b64 s[0:1], 0x3f717273 s_orn2_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nand_saveexec_b64 s[0:1], s[2:3] // GFX12: s_nand_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x27,0x80,0xbe] @@ -2484,7 +2499,8 @@ s_nand_saveexec_b64 s[0:1], 0x3f717273 s_nand_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nand_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nor_saveexec_b64 s[0:1], s[2:3] // GFX12: s_nor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x29,0x80,0xbe] @@ -2524,7 +2540,8 @@ s_nor_saveexec_b64 s[0:1], 0x3f717273 s_nor_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xnor_saveexec_b64 s[0:1], s[2:3] // GFX12: s_xnor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x2b,0x80,0xbe] @@ -2564,7 +2581,8 @@ s_xnor_saveexec_b64 s[0:1], 0x3f717273 s_xnor_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xnor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_quadmask_b32 s0, s1 // GFX12: s_quadmask_b32 s0, s1 ; encoding: [0x01,0x1a,0x80,0xbe] @@ -2667,7 +2685,8 @@ s_quadmask_b64 s[0:1], 0x3f717273 s_quadmask_b64 s[0:1], 0xaf123456 // GFX1200: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_quadmask_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_movrels_b32 s0, s1 // GFX12: s_movrels_b32 s0, s1 ; encoding: [0x01,0x40,0x80,0xbe] @@ -2812,7 +2831,8 @@ s_movreld_b64 s[0:1], 0x3f717273 s_movreld_b64 s[0:1], 0xaf123456 // GFX1200: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_movreld_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_abs_i32 s0, s1 // GFX12: s_abs_i32 s0, s1 ; encoding: [0x01,0x15,0x80,0xbe] @@ -2912,7 +2932,8 @@ s_andn1_saveexec_b64 s[0:1], 0x3f717273 s_andn1_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_orn1_saveexec_b64 s[0:1], s[2:3] // GFX12: s_or_not0_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x2f,0x80,0xbe] @@ -2952,7 +2973,8 @@ s_orn1_saveexec_b64 s[0:1], 0x3f717273 s_orn1_saveexec_b64 s[0:1], 0xaf123456 // GFX1200: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_andn1_wrexec_b64 s[0:1], s[2:3] // GFX12: s_and_not0_wrexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x35,0x80,0xbe] @@ -2992,7 +3014,8 @@ s_andn1_wrexec_b64 s[0:1], 0x3f717273 s_andn1_wrexec_b64 s[0:1], 0xaf123456 // GFX1200: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not0_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_andn2_wrexec_b64 s[0:1], s[2:3] // GFX12: s_and_not1_wrexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x37,0x80,0xbe] @@ -3032,7 +3055,8 @@ s_andn2_wrexec_b64 s[0:1], 0x3f717273 s_andn2_wrexec_b64 s[0:1], 0xaf123456 // GFX1200: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_bitreplicate_b64_b32 s[0:1], s2 // GFX12: s_bitreplicate_b64_b32 s[0:1], s2 ; encoding: [0x02,0x14,0x80,0xbe] @@ -3831,7 +3855,8 @@ s_ctz_i32_b64 exec_hi, src_scc s_ctz_i32_b64 null, 0xaf123456 // GFX1200: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xff,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_ctz_i32_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_not1_saveexec_b64 s[10:11], s[2:3] // GFX12: s_and_not1_saveexec_b64 s[10:11], s[2:3] ; encoding: [0x02,0x31,0x8a,0xbe] @@ -3859,7 +3884,8 @@ s_and_not1_saveexec_b64 ttmp[14:15], src_scc s_and_not1_saveexec_b64 null, 0xaf123456 // GFX1200: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_not0_saveexec_b32 s5, s1 // GFX12: s_and_not0_saveexec_b32 s5, s1 ; encoding: [0x01,0x2c,0x85,0xbe] @@ -3920,7 +3946,8 @@ s_and_not0_saveexec_b64 ttmp[14:15], src_scc s_and_not0_saveexec_b64 null, 0xaf123456 // GFX1200: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_not0_wrexec_b32 s5, s1 // GFX12: s_and_not0_wrexec_b32 s5, s1 ; encoding: [0x01,0x34,0x85,0xbe] @@ -3981,7 +4008,8 @@ s_and_not0_wrexec_b64 ttmp[14:15], src_scc s_and_not0_wrexec_b64 null, 0xaf123456 // GFX1200: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xff,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not0_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_not1_saveexec_b32 s5, s1 // GFX12: s_and_not1_saveexec_b32 s5, s1 ; encoding: [0x01,0x30,0x85,0xbe] @@ -4075,7 +4103,8 @@ s_and_not1_wrexec_b64 ttmp[14:15], src_scc s_and_not1_wrexec_b64 null, 0xaf123456 // GFX1200: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xff,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_cls_i32 s5, s1 // GFX12: s_cls_i32 s5, s1 ; encoding: [0x01,0x0c,0x85,0xbe] @@ -4145,7 +4174,8 @@ s_cls_i32_i64 exec_hi, src_scc s_cls_i32_i64 null, 0xaf123456 // GFX1200: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xff,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cls_i32_i64 null, lit64(0xaf123456) ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_clz_i32_u32 s5, s1 // GFX12: s_clz_i32_u32 s5, s1 ; encoding: [0x01,0x0a,0x85,0xbe] @@ -4215,7 +4245,8 @@ s_clz_i32_u64 exec_hi, src_scc s_clz_i32_u64 null, 0xaf123456 // GFX1200: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xff,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_clz_i32_u64 null, lit64(0xaf123456) ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_not0_saveexec_b32 s5, s1 // GFX12: s_or_not0_saveexec_b32 s5, s1 ; encoding: [0x01,0x2e,0x85,0xbe] @@ -4276,7 +4307,8 @@ s_or_not0_saveexec_b64 ttmp[14:15], src_scc s_or_not0_saveexec_b64 null, 0xaf123456 // GFX1200: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_not1_saveexec_b32 s5, s1 // GFX12: s_or_not1_saveexec_b32 s5, s1 ; encoding: [0x01,0x32,0x85,0xbe] @@ -4337,4 +4369,5 @@ s_or_not1_saveexec_b64 ttmp[14:15], src_scc s_or_not1_saveexec_b64 null, 0xaf123456 // GFX1200: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s index 9c83879..3a24442 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s s_add_nc_u64 s[0:1], s[2:3], s[4:5] // GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xa9] @@ -56,7 +56,8 @@ s_add_nc_u64 s[0:1], 0x3f717273, s[2:3] s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] // GFX1200: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf] -// GFX1250: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_add_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_add_nc_u64 s[0:1], s[2:3], exec // GFX12: s_add_nc_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0xa9] @@ -81,7 +82,8 @@ s_add_nc_u64 s[0:1], s[2:3], 0x3f717273 s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0xa9,0x56,0x34,0x12,0xaf] -// GFX1250: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_add_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_sub_nc_u64 s[0:1], s[2:3], s[4:5] // GFX12: s_sub_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x00,0xaa] @@ -136,7 +138,8 @@ s_sub_nc_u64 s[0:1], 0x3f717273, s[2:3] s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] // GFX1200: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf] -// GFX1250: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_sub_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_sub_nc_u64 s[0:1], s[2:3], exec // GFX12: s_sub_nc_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x00,0xaa] @@ -161,7 +164,8 @@ s_sub_nc_u64 s[0:1], s[2:3], 0x3f717273 s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x00,0xaa,0x56,0x34,0x12,0xaf] -// GFX1250: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_sub_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_mul_u64 s[0:1], s[2:3], s[4:5] // GFX12: s_mul_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xaa] @@ -216,7 +220,8 @@ s_mul_u64 s[0:1], 0x3f717273, s[2:3] s_mul_u64 s[0:1], 0xaf123456, s[2:3] // GFX1200: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf] -// GFX1250: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_mul_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_mul_u64 s[0:1], s[2:3], exec // GFX12: s_mul_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0xaa] @@ -241,7 +246,8 @@ s_mul_u64 s[0:1], s[2:3], 0x3f717273 s_mul_u64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0xaa,0x56,0x34,0x12,0xaf] -// GFX1250: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_mul_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_add_f32 s5, s1, s2 // GFX12: s_add_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa0] @@ -2359,7 +2365,8 @@ s_cselect_b64 s[0:1], 0x3f717273, s[4:5] s_cselect_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x98,0x56,0x34,0x12,0xaf] -// GFX1250: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cselect_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_cselect_b64 s[0:1], s[2:3], exec // GFX12: s_cselect_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x98] @@ -2384,7 +2391,8 @@ s_cselect_b64 s[0:1], s[2:3], 0x3f717273 s_cselect_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x98,0x56,0x34,0x12,0xaf] -// GFX1250: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cselect_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_b32 s0, s1, s2 // GFX12: s_and_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8b] @@ -2553,7 +2561,8 @@ s_and_b64 s[0:1], 0x3f717273, s[4:5] s_and_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_b64 s[0:1], s[2:3], exec // GFX12: s_and_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8b] @@ -2578,7 +2587,8 @@ s_and_b64 s[0:1], s[2:3], 0x3f717273 s_and_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8b,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_b32 s0, s1, s2 // GFX12: s_or_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8c] @@ -2738,7 +2748,8 @@ s_or_b64 s[0:1], 0x3f717273, s[4:5] s_or_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_b64 s[0:1], s[2:3], exec // GFX12: s_or_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8c] @@ -2763,7 +2774,8 @@ s_or_b64 s[0:1], s[2:3], 0x3f717273 s_or_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8c,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xor_b32 s0, s1, s2 // GFX12: s_xor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8d] @@ -2923,7 +2935,8 @@ s_xor_b64 s[0:1], 0x3f717273, s[4:5] s_xor_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf] -// GFX1250: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xor_b64 s[0:1], s[2:3], exec // GFX12: s_xor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8d] @@ -2948,7 +2961,8 @@ s_xor_b64 s[0:1], s[2:3], 0x3f717273 s_xor_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8d,0x56,0x34,0x12,0xaf] -// GFX1250: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_andn2_b32 s0, s1, s2 // GFX12: s_and_not1_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x91] @@ -3108,7 +3122,8 @@ s_andn2_b64 s[0:1], 0x3f717273, s[4:5] s_andn2_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x91,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_andn2_b64 s[0:1], s[2:3], exec // GFX12: s_and_not1_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x91] @@ -3133,7 +3148,8 @@ s_andn2_b64 s[0:1], s[2:3], 0x3f717273 s_andn2_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x91,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_orn2_b32 s0, s1, s2 // GFX12: s_or_not1_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x92] @@ -3293,7 +3309,8 @@ s_orn2_b64 s[0:1], 0x3f717273, s[4:5] s_orn2_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x92,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_orn2_b64 s[0:1], s[2:3], exec // GFX12: s_or_not1_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x92] @@ -3318,7 +3335,8 @@ s_orn2_b64 s[0:1], s[2:3], 0x3f717273 s_orn2_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x92,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nand_b32 s0, s1, s2 // GFX12: s_nand_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8e] @@ -3478,7 +3496,8 @@ s_nand_b64 s[0:1], 0x3f717273, s[4:5] s_nand_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf] -// GFX1250: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nand_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nand_b64 s[0:1], s[2:3], exec // GFX12: s_nand_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8e] @@ -3503,7 +3522,8 @@ s_nand_b64 s[0:1], s[2:3], 0x3f717273 s_nand_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8e,0x56,0x34,0x12,0xaf] -// GFX1250: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nand_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nor_b32 s0, s1, s2 // GFX12: s_nor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8f] @@ -3663,7 +3683,8 @@ s_nor_b64 s[0:1], 0x3f717273, s[4:5] s_nor_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf] -// GFX1250: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_nor_b64 s[0:1], s[2:3], exec // GFX12: s_nor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8f] @@ -3688,7 +3709,8 @@ s_nor_b64 s[0:1], s[2:3], 0x3f717273 s_nor_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8f,0x56,0x34,0x12,0xaf] -// GFX1250: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_nor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xnor_b32 s0, s1, s2 // GFX12: s_xnor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x90] @@ -3848,7 +3870,8 @@ s_xnor_b64 s[0:1], 0x3f717273, s[4:5] s_xnor_b64 s[0:1], 0xaf123456, s[4:5] // GFX1200: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x90,0x56,0x34,0x12,0xaf] -// GFX1250: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xnor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_xnor_b64 s[0:1], s[2:3], exec // GFX12: s_xnor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x90] @@ -3873,7 +3896,8 @@ s_xnor_b64 s[0:1], s[2:3], 0x3f717273 s_xnor_b64 s[0:1], s[2:3], 0xaf123456 // GFX1200: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x90,0x56,0x34,0x12,0xaf] -// GFX1250: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_xnor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_lshl_b32 s0, s1, s2 // GFX12: s_lshl_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x84] @@ -4033,7 +4057,8 @@ s_lshl_b64 s[0:1], 0x3f717273, s4 s_lshl_b64 s[0:1], 0xaf123456, s4 // GFX1200: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x84,0x56,0x34,0x12,0xaf] -// GFX1250: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_lshl_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_lshl_b64 s[0:1], s[2:3], exec_lo // GFX12: s_lshl_b64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x84] @@ -4217,7 +4242,8 @@ s_lshr_b64 s[0:1], 0x3f717273, s4 s_lshr_b64 s[0:1], 0xaf123456, s4 // GFX1200: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x85,0x56,0x34,0x12,0xaf] -// GFX1250: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_lshr_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_lshr_b64 s[0:1], s[2:3], exec_lo // GFX12: s_lshr_b64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x85] @@ -4401,7 +4427,8 @@ s_ashr_i64 s[0:1], 0x3f717273, s4 s_ashr_i64 s[0:1], 0xaf123456, s4 // GFX1200: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x86,0x56,0x34,0x12,0xaf] -// GFX1250: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_ashr_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_ashr_i64 s[0:1], s[2:3], exec_lo // GFX12: s_ashr_i64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x86] @@ -4996,7 +5023,8 @@ s_bfe_u64 s[0:1], 0x3f717273, s4 s_bfe_u64 s[0:1], 0xaf123456, s4 // GFX1200: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x00,0x94,0x56,0x34,0x12,0xaf] -// GFX1250: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_bfe_u64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_bfe_u64 s[0:1], s[2:3], exec_lo // GFX12: s_bfe_u64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x00,0x94] @@ -5075,7 +5103,8 @@ s_bfe_i64 s[0:1], 0x3f717273, s4 s_bfe_i64 s[0:1], 0xaf123456, s4 // GFX1200: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x94,0x56,0x34,0x12,0xaf] -// GFX1250: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_bfe_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_bfe_i64 s[0:1], s[2:3], exec_lo // GFX12: s_bfe_i64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x94] @@ -6279,7 +6308,8 @@ s_and_not1_b64 s[10:11], vcc, ttmp[14:15] s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 // GFX1200: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x91,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_and_not1_b64 s[10:11], exec, src_scc // GFX12: s_and_not1_b64 s[10:11], exec, src_scc ; encoding: [0x7e,0xfd,0x8a,0x91] @@ -6298,7 +6328,8 @@ s_and_not1_b64 exec, src_scc, exec s_and_not1_b64 null, 0xaf123456, vcc // GFX1200: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xff,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf] -// GFX1250: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_and_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_not1_b64 s[10:11], s[2:3], s[4:5] // GFX12: s_or_not1_b64 s[10:11], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x8a,0x92] @@ -6311,7 +6342,8 @@ s_or_not1_b64 s[10:11], vcc, ttmp[14:15] s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 // GFX1200: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x92,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_or_not1_b64 s[10:11], exec, src_scc // GFX12: s_or_not1_b64 s[10:11], exec, src_scc ; encoding: [0x7e,0xfd,0x8a,0x92] @@ -6330,4 +6362,5 @@ s_or_not1_b64 exec, src_scc, exec s_or_not1_b64 null, 0xaf123456, vcc // GFX1200: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xff,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf] -// GFX1250: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_or_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s index 98bb3c3..8056cef 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s s_cmp_lt_f32 s1, s2 // GFX12: s_cmp_lt_f32 s1, s2 ; encoding: [0x01,0x02,0x41,0xbf] @@ -2120,7 +2120,8 @@ s_cmp_eq_u64 s[0:1], 0x3f717273 s_cmp_eq_u64 s[0:1], 0xaf123456 // GFX1200: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf] -// GFX1250: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cmp_eq_u64 s[0:1], lit64(0xaf123456) ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] s_cmp_lg_u64 s[0:1], s[2:3] // GFX12: s_cmp_lg_u64 s[0:1], s[2:3] ; encoding: [0x00,0x02,0x11,0xbf] @@ -2163,4 +2164,5 @@ s_cmp_lg_u64 s[0:1], 0x3f717273 s_cmp_lg_u64 s[0:1], 0xaf123456 // GFX1200: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xff,0x11,0xbf,0x56,0x34,0x12,0xaf] -// GFX1250: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-ASM: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1250-DIS: s_cmp_lg_u64 s[0:1], lit64(0xaf123456) ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/lit.local.cfg b/llvm/test/MC/AMDGPU/lit.local.cfg index c5853ad..12a5c8a 100644 --- a/llvm/test/MC/AMDGPU/lit.local.cfg +++ b/llvm/test/MC/AMDGPU/lit.local.cfg @@ -1,4 +1,4 @@ -config.substitutions.append(("%extract-encodings", "sed 's/.*encoding://p'")) +config.substitutions.append(("%extract-encodings", "sed -n 's/.*encoding://p'")) if not "AMDGPU" in config.root.targets: config.unsupported = True diff --git a/llvm/test/MC/AMDGPU/offset-expr.s b/llvm/test/MC/AMDGPU/offset-expr.s index 92a9bf1b..7c3c71c 100644 --- a/llvm/test/MC/AMDGPU/offset-expr.s +++ b/llvm/test/MC/AMDGPU/offset-expr.s @@ -9,10 +9,10 @@ BB1: v_nop_e64 BB2: s_add_u32 vcc_lo, vcc_lo, (BB2-BB1)&4294967295 -// CHECK: s_add_u32 vcc_lo, vcc_lo, 8 // 000000000018: 806AFF6A 00000008 +// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0x8) // 000000000018: 806AFF6A 00000008 s_addc_u32 vcc_hi, vcc_hi, (BB2-BB1)>>32 -// CHECK: s_addc_u32 vcc_hi, vcc_hi, 0 // 000000000020: 826BFF6B 00000000 +// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0x0) // 000000000020: 826BFF6B 00000000 s_add_u32 vcc_lo, vcc_lo, (BB0-BB1)&4294967295 -// CHECK: s_add_u32 vcc_lo, vcc_lo, -16 // 000000000028: 806AFF6A FFFFFFF0 +// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0xfffffff0) // 000000000028: 806AFF6A FFFFFFF0 s_addc_u32 vcc_hi, vcc_hi, (BB0-BB1)>>32 -// CHECK: s_addc_u32 vcc_hi, vcc_hi, -1 // 000000000030: 826BFF6B FFFFFFFF +// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0xffffffff) // 000000000030: 826BFF6B FFFFFFFF diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt index d2da087..856d7c2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt @@ -40,8 +40,7 @@ # VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00] 0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01 -# FIXME: This should be able to round trip with literal after instruction -# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e] +# VI: v_add_f16_e32 v1, lit(0x0), v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x00,0x00,0x00] 0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00 # VI: v_add_f16_e32 v1, 0xffcd, v3 ; encoding: [0xff,0x06,0x02,0x3e,0xcd,0xff,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/literals.txt b/llvm/test/MC/Disassembler/AMDGPU/literals.txt new file mode 100644 index 0000000..bd013a1 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/literals.txt @@ -0,0 +1,30 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00 +# GFX1250: v_tanh_bf16_e32 v127.l, lit(0x1) ; encoding: [0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00] + +0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00 +# GFX1250: v_pk_add_bf16 v255, lit(0x1), vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00] + +0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00 +# GFX1250: v_tanh_f16_e32 v127.l, lit(0x1) ; encoding: [0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00] + +0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00 +# GFX1250: v_pk_fmac_f16 v255, lit(0x1), v255 ; encoding: [0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00] + +# The immediate is always literal in this instruction. +0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_bf8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] + +0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f16_bf8 v1, lit(0x1) ; encoding: [0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00] + +0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00 +# GFX1250: v_pk_add_min_i16 v10, lit(0x1), v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00] + +0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e32 v255, lit(0x1) ; encoding: [0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00] + +0xff,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00 +# GFX1250: v_mov_b64_e32 v[254:255], lit(0x1) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] diff --git a/llvm/test/Transforms/GVN/PRE/pre-load.ll b/llvm/test/Transforms/GVN/PRE/pre-load.ll index 5a07f9f..afa1354 100644 --- a/llvm/test/Transforms/GVN/PRE/pre-load.ll +++ b/llvm/test/Transforms/GVN/PRE/pre-load.ll @@ -1503,3 +1503,51 @@ wrong: exit: ret void } + +; Allow the load to be made available on the edge (%entry, %if.end) as part of PRE, +; but ensure `%identical.l` is not hoisted to its predecessor due to the local +; dependency with the call. + +define i32 @test24(ptr noalias %p, ptr noalias %q, i1 %c) { +; MDEP-LABEL: @test24( +; MDEP-NEXT: entry: +; MDEP-NEXT: br i1 [[C:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] +; MDEP: entry.if.end_crit_edge: +; MDEP-NEXT: [[VV_PRE:%.*]] = load i32, ptr [[X:%.*]], align 4 +; MDEP-NEXT: br label [[IF_END:%.*]] +; MDEP: if.then: +; MDEP-NEXT: call void @opaque(ptr [[X]]) +; MDEP-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4 +; MDEP-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4 +; MDEP-NEXT: br label [[IF_END]] +; MDEP: if.end: +; MDEP-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] +; MDEP-NEXT: ret i32 [[VV]] +; +; MSSA-LABEL: @test24( +; MSSA-NEXT: entry: +; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; MSSA: if.then: +; MSSA-NEXT: call void @opaque(ptr [[X:%.*]]) +; MSSA-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4 +; MSSA-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4 +; MSSA-NEXT: br label [[IF_END]] +; MSSA: if.end: +; MSSA-NEXT: [[VV:%.*]] = load i32, ptr [[X]], align 4 +; MSSA-NEXT: ret i32 [[VV]] +; +entry: + br i1 %c, label %if.end, label %if.then + +if.then: + call void @opaque(ptr %p) + %identical.l = load i32, ptr %p, align 4 + store i32 %identical.l, ptr %q, align 4 + br label %if.end + +if.end: + %l = load i32, ptr %p, align 4 + ret i32 %l +} + +declare void @opaque(ptr) nounwind willreturn diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll index cb4e07e..9b9bc68 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll @@ -60,8 +60,7 @@ define void @f_sadd_overflow(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -150,8 +149,7 @@ define void @f_uadd_overflow(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -243,10 +241,7 @@ define void @f_ssub_overflow(ptr nocapture %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 -; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] @@ -339,10 +334,7 @@ define void @f_usub_overflow(ptr nocapture %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 -; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] ; CHECK: [[TRAP]]: ; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] ; CHECK-NEXT: unreachable, !nosanitize [[META0]] diff --git a/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll new file mode 100644 index 0000000..b9c9228 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll @@ -0,0 +1,738 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=indvars < %s | FileCheck %s + +define void @optimize_trap(i32 %block_size) { +; CHECK-LABEL: define void @optimize_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_atomic(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_atomic( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store atomic i8 %1, ptr %arrayidx7 unordered, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_volatile(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_volatile( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store volatile i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_call(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_call( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: call void @x(ptr null) +; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.trap() + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + call void @x(ptr null) + store volatile i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @optimize_ubsan_trap(i32 %block_size) { +; CHECK-LABEL: define void @optimize_ubsan_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @llvm.ubsantrap(i8 1) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @llvm.ubsantrap(i8 1) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_arbitrary_call(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_arbitrary_call( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn_with_argmem(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn_with_argmem(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_two_exits(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_two_exits( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[P:%.*]] = call i1 @pred() +; CHECK-NEXT: br i1 [[P]], label %[[FOR_BODY_CONT:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: [[FOR_BODY_CONT]]: +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %p = call i1 @pred() + br i1 %p, label %for.body.cont, label %for.cond.cleanup.loopexit + +for.body.cont: ; preds = %for.body.preheader, %if.end4 + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_two_exits2(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_two_exits2( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_BODY_CONT:.*]] +; CHECK: [[FOR_BODY_CONT]]: +; CHECK-NEXT: [[P:%.*]] = call i1 @pred() +; CHECK-NEXT: br i1 [[P]], label %[[IF_END4]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %for.body.cont + +for.body.cont: ; preds = %for.body.preheader, %if.end4 + %p = call i1 @pred() + br i1 %p, label %if.end4, label %for.cond.cleanup.loopexit + +if.then: ; preds = %for.body + call void @noreturn(ptr %foo_arr) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_depdendent_ubsan_trap(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_depdendent_ubsan_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[I_015_LCSSA:%.*]] = phi i32 [ [[I_015]], %[[FOR_BODY]] ] +; CHECK-NEXT: call void @noreturn_with_i32(i32 [[I_015_LCSSA]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + call void @noreturn_with_i32(i32 %i.015) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @no_optimize_depdendent_load_trap(i32 %block_size) { +; CHECK-LABEL: define void @no_optimize_depdendent_load_trap( +; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16 +; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]]) +; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0 +; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]]) +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[I_015_LCSSA:%.*]] = load i8, ptr [[FOO_ARR]], align 1 +; CHECK-NEXT: call void @noreturn_with_i8(i8 [[I_015_LCSSA]]) +; CHECK-NEXT: unreachable +; CHECK: [[IF_END4]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]] +; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]] +; +entry: + %foo_arr = alloca [2 x i8], align 16 + %bar_arr = alloca [2 x i8], align 16 + call void @x(ptr nonnull %foo_arr) + %cmp14.not = icmp eq i32 %block_size, 0 + br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %if.end4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + call void @x(ptr nonnull %bar_arr) + ret void + +for.body: ; preds = %for.body.preheader, %if.end4 + %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ] + %cmp1 = icmp samesign ugt i32 %i.015, 2 + br i1 %cmp1, label %if.then, label %if.end4 + +if.then: ; preds = %for.body + %r = load i8, ptr %foo_arr, align 1 + call void @noreturn_with_i8(i8 %r) + unreachable + +if.end4: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015 + %0 = load i8, ptr %arrayidx, align 1 + %1 = xor i8 %0, 54 + %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015 + store i8 %1, ptr %arrayidx7, align 1 + %inc = add nuw nsw i32 %i.015, 1 + %cmp = icmp ult i32 %inc, %block_size + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + + +declare void @x(ptr noundef) local_unnamed_addr +declare i1 @pred() local_unnamed_addr + +declare void @llvm.trap() #0 +declare void @noreturn(ptr) #0 +declare void @noreturn_with_i32(i32) #0 +declare void @noreturn_with_i8(i8) #0 +declare void @noreturn_with_argmem(ptr) #1 + +attributes #0 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #1 = { cold noreturn nounwind memory(argmem: read) } diff --git a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll index 7cc4446..ad45d1e 100644 --- a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll +++ b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll @@ -11,10 +11,8 @@ define i16 @test5(i16 %A) !dbg !34 { call void @llvm.dbg.value(metadata i32 %C, metadata !37, metadata !DIExpression()), !dbg !41 ; Preserve the dbg.value for the DCE'd 32-bit 'and'. - ; - ; The high 16 bits of the original 'and' require sign-extending the new 16-bit and: ; CHECK-NEXT: #dbg_value(i16 [[and]], [[C:![0-9]+]], - ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value) + ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_stack_value) %D = trunc i32 %C to i16, !dbg !42 call void @llvm.dbg.value(metadata i16 %D, metadata !38, metadata !DIExpression()), !dbg !42 diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll index 69b8f69..82ecbd4 100644 --- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll +++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll @@ -86,3 +86,14 @@ define <4 x ptr> @test7(<4 x i128> %arg) nounwind { %p1 = inttoptr <4 x i128> %arg to <4 x ptr> ret <4 x ptr> %p1 } + +define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) { +; CHECK-LABEL: @ptrtoint_gep_sub( +; CHECK-NEXT: ret i64 [[END_ADDR:%.*]] +; + %ptr.addr = ptrtoint ptr %ptr to i64 + %size = sub i64 %end.addr, %ptr.addr + %end = getelementptr i8, ptr %ptr, i64 %size + %end.addr2 = ptrtoint ptr %end to i64 + ret i64 %end.addr2 +} diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll index ed9fba3..22ab79d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll @@ -289,6 +289,225 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l ret void } +define void @deinterleave1_nxi64_factor3(ptr %ptr, <vscale x 4 x i64>* %s1, <vscale x 4 x i64>* %s2, <vscale x 4 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } poison, <vscale x 4 x i64> [[TMP10]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP15]], <vscale x 4 x i64> [[TMP12]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP16]], <vscale x 4 x i64> [[TMP14]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 2 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP18]], ptr [[S1]], align 32 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP19]], ptr [[S2]], align 32 +; CHECK-NEXT: store <vscale x 4 x i64> [[TMP20]], ptr [[S3]], align 32 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 12 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 2 + + store <vscale x 4 x i64> %3, <vscale x 4 x i64>* %s1 + store <vscale x 4 x i64> %4, <vscale x 4 x i64>* %s2 + store <vscale x 4 x i64> %5, <vscale x 4 x i64>* %s3 + ret void +} + +define void @deinterleave2_nxi64_factor3(ptr %ptr, <vscale x 8 x i64>* %s1, <vscale x 8 x i64>* %s2, <vscale x 8 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave2_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6 +; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP16]], i64 4) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP18]], i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP20]], i64 4) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9 +; CHECK-NEXT: [[LDN4:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP17]], <vscale x 2 x i64> [[TMP23]], i64 6) +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP25]], i64 6) +; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP21]], <vscale x 2 x i64> [[TMP27]], i64 6) +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } poison, <vscale x 8 x i64> [[TMP24]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP29]], <vscale x 8 x i64> [[TMP26]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP30]], <vscale x 8 x i64> [[TMP28]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 2 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP32]], ptr [[S1]], align 64 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP33]], ptr [[S2]], align 64 +; CHECK-NEXT: store <vscale x 8 x i64> [[TMP34]], ptr [[S3]], align 64 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 24 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 2 + + store <vscale x 8 x i64> %3, <vscale x 8 x i64>* %s1 + store <vscale x 8 x i64> %4, <vscale x 8 x i64>* %s2 + store <vscale x 8 x i64> %5, <vscale x 8 x i64>* %s3 + ret void +} + +define void @deinterleave_neg1_nxi64_factor3(ptr %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave_neg1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8 +; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8 +; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec) + + %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0 + %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1 + %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2 + + store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1 + store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2 + store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3 + ret void +} + +define void @deinterleave_neg2_nxi8_factor3(ptr %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) #0 { +; CHECK-LABEL: define void @deinterleave_neg2_nxi8_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8 +; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8 +; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8 +; CHECK-NEXT: ret void +; + %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8 + %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv12i8(<vscale x 24 x i8> %wide.vec) + + %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0 + %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1 + %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2 + + store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1 + store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2 + store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3 + ret void +} + +define void @interleave1_nxi64_factor3(ptr %ptr, <vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) #0 { +; CHECK-LABEL: define void @interleave1_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i64> [[S1:%.*]], <vscale x 8 x i64> [[S2:%.*]], <vscale x 8 x i64> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]]) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 4) +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 4) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP9]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9 +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 6) +; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 6) +; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 6) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) + + store <vscale x 24 x i64> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave2_nxi64_factor3(ptr %ptr, <vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) #0 { +; CHECK-LABEL: define void @interleave2_nxi64_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i64> [[S1:%.*]], <vscale x 4 x i64> [[S2:%.*]], <vscale x 4 x i64> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) + + store <vscale x 12 x i64> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_neg_nxi8_factor3(ptr %ptr, <vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) #0 { +; CHECK-LABEL: define void @interleave_neg_nxi8_factor3 +; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i8> [[S1:%.*]], <vscale x 8 x i8> [[S2:%.*]], <vscale x 8 x i8> [[S3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> [[S1]], <vscale x 8 x i8> [[S2]], <vscale x 8 x i8> [[S3]]) +; CHECK-NEXT: store <vscale x 24 x i8> [[INTERLEAVE]], ptr [[PTR]], align 4 +; CHECK-NEXT: ret void +; + %interleave = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) + + store <vscale x 24 x i8> %interleave, ptr %ptr, align 4 + ret void +} + declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>) declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>) declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>) @@ -312,4 +531,15 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, < ; Larger interleaves to test 'legalization' declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>) +; De-Interleaves with Factor=3 +declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>) +declare { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64>) +declare { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64>) +declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>) + +; Interleaves with Factor=3 +declare <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>) +declare <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64>) +declare <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64>) + attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll index 279d4e8..83623fd 100644 --- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll +++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @foo( ;CHECK: icmp eq <4 x i32> ;CHECK: select <4 x i1> -;CHECK: ret i32 -define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp { +;CHECK: ret void +define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp { entry: %cmp10 = icmp sgt i32 %x, 0 br i1 %cmp10, label %for.body, label %for.end @@ -35,5 +35,5 @@ if.end: ; preds = %for.body, %if.then br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %if.end, %entry - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 6cf11be..6fe6883 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -660,16 +660,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 ; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]] +; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] ; COMMON: [[PRED_STORE_IF13]]: ; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 ; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT1]] +; COMMON-NEXT: br label %[[EXIT]] +; COMMON: [[EXIT]]: +; COMMON-NEXT: br label %[[SCALAR_PH:.*]] +; COMMON: [[SCALAR_PH]]: +; COMMON-NEXT: br label %[[EXIT1:.*]] ; COMMON: [[EXIT1]]: -; COMMON-NEXT: br label %[[SCALAR_PH1:.*]] -; COMMON: [[SCALAR_PH1]]: -; COMMON-NEXT: br [[EXIT:label %.*]] -; COMMON: [[SCALAR_PH:.*:]] +; COMMON-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 93e71af..e3e4833 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -42,7 +42,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -80,7 +80,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] @@ -104,7 +104,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -167,13 +167,13 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -211,7 +211,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] @@ -235,7 +235,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -308,7 +308,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH:.*]] ; CHECK: [[SCALAR_PH]]: @@ -332,7 +332,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14 -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index e424649..75b18ff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -541,3 +541,22 @@ exit: ; preds = %for.body ; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-VS1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VS1: [[PROF3]] = !{!"branch_weights", i32 8, i32 8} +; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30} +;. +; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VS2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VS2: [[PROF3]] = !{!"branch_weights", i32 8, i32 8} +; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll index a6e0f8a..300f5d9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll @@ -40,6 +40,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -53,6 +54,15 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 @@ -262,6 +272,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -275,6 +286,15 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 @@ -412,6 +432,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK-ARMPL: [[ENTRY:.*:]] ; CHECK-ARMPL: [[VECTOR_PH:.*:]] ; CHECK-ARMPL: [[VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[VECTOR_BODY1:.*:]] ; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]]) ; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0 @@ -425,6 +446,15 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] ; CHECK-ARMPL: [[SCALAR_PH:.*:]] ; CHECK-ARMPL: [[FOR_BODY:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]] +; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]]) +; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0 +; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1 +; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4 +; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4 +; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]] +; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]] +; CHECK-ARMPL: [[FOR_BODY1:.*:]] ; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]]) ; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 ; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 8830ce3..5f79d02 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -38,8 +38,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -96,8 +97,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index d447517..f03f743 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -29,8 +29,9 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[COND:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true ; CHECK-NEXT: br i1 [[COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index b8f4e84..753847f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -33,8 +33,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true ; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: @@ -87,8 +88,9 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl ; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll index 596e42e..d0c1194 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll @@ -36,7 +36,7 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %exit, label %for.body } -define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 { +define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 { ; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ] ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032 @@ -70,7 +70,7 @@ for.cond.cleanup.loopexit: ; preds = %if.end br label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret i32 undef + ret void for.body: ; preds = %for.body.preheader, %if.end %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index e046816..e84c0d6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -67,7 +67,7 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -111,7 +111,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -155,7 +155,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -207,7 +207,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -221,7 +221,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 2fbc73e..c66d8d6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -133,7 +133,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -237,7 +237,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: @@ -346,7 +346,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -360,7 +360,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -468,7 +468,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -483,7 +483,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -640,7 +640,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -656,7 +656,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -790,14 +790,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]] ; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META6:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]] ; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]] -; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]] ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: @@ -813,7 +813,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; @@ -965,7 +965,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: @@ -981,7 +981,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -1145,16 +1145,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]] ; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP18]] ; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META12:![0-9]+]] ; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) -; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]] ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]] ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]] ; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: @@ -1170,7 +1170,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]] ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; @@ -1318,7 +1318,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; NOSTRIDED: middle.block: ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: exit: @@ -1402,7 +1402,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; STRIDED: middle.block: ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: exit: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 0c22a9e..46daee4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -142,7 +142,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] @@ -267,7 +267,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: exit: @@ -382,7 +382,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] @@ -508,7 +508,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: exit: @@ -621,7 +621,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]]) ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index bae97e5..c34417b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -129,7 +129,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -143,7 +143,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[FOR_END]]: ; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] @@ -204,7 +204,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] ; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -218,7 +218,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ] ; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] @@ -269,7 +269,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -350,7 +350,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -399,7 +399,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -457,7 +457,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -499,7 +499,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -557,7 +557,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -608,7 +608,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -679,7 +679,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -731,7 +731,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -812,7 +812,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -860,7 +860,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -918,7 +918,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll index 9e20586..44fb8cb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll @@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0" ; CHECK-LABEL: @read_mod_write_single_ptr( ; CHECK: load <8 x float> -; CHECK: ret i32 -define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } ; CHECK-LABEL: @read_mod_i64( ; SLOWMEM32: load <2 x i64> ; FASTMEM32: load <4 x i64> -; CHECK: ret i32 -define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp { br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index e11b1ad..27d5e64 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" } ; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 ; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1 ; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1 -; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}} ; ; @a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 @@ -215,8 +214,9 @@ define void @PR40816() #1 { ; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 ; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FORCE: [[MIDDLE_BLOCK]]: -; FORCE-NEXT: br [[RETURN:label %.*]] -; FORCE: [[SCALAR_PH:.*:]] +; FORCE-NEXT: br label %[[RETURN:.*]] +; FORCE: [[RETURN]]: +; FORCE-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 6d2cda4..0287645 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { +define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { ; CHECK-LABEL: @conversion_cost1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3 ; CHECK-NEXT: br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]] @@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; %1 = icmp sgt i32 %n, 3 br i1 %1, label %.lr.ph, label %._crit_edge @@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } -define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { +define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp { ; CHECK-LABEL: @conversion_cost2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9 ; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] @@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; %1 = icmp sgt i32 %n, 9 br i1 %1, label %.lr.ph, label %._crit_edge @@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 9453ad7..725fa49 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -540,6 +540,8 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -551,14 +553,6 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1) ; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1) ; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll index af5c921..fa3b4a66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux" ;CHECK-LABEL: func1x6( ;CHECK: <4 x i32> ;CHECK: ret -define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { +define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { entry: br label %for.body @@ -40,14 +40,14 @@ for.body: ; preds = %for.body, %entry br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body - ret i32 undef + ret void } ; We are vectorizing with 12 runtime checks. ;CHECK-LABEL: func2x6( ;CHECK: <4 x i32> ;CHECK: ret -define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { +define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) { entry: br label %for.body @@ -85,5 +85,5 @@ for.body: ; preds = %for.body, %entry br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll index 8971dfe..47355e7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll @@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK-NOUNRL: store <4 x i32> ;CHECK-NOUNRL-NOT: store <4 x i32> ;CHECK-NOUNRL: ret -define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { +define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 28de5c7..56f0b85 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -58,7 +58,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -72,7 +72,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20 -; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -88,7 +88,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -132,14 +132,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: @@ -180,14 +180,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index 65c12a1..224ec4a6 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -34,8 +34,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -73,29 +74,28 @@ define void @test2(ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 2 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2 ; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[FOR_END:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void ; entry: %ptrint = ptrtoint ptr %a to i64 @@ -163,7 +163,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index f64255f..b7aa958 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; When scalarizing stores we need to preserve the original order. ; Make sure that we are extracting in the correct order (0101, and not 0011). -define i32 @foo(ptr nocapture %A) { +define void @foo(ptr nocapture %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] @@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -55,7 +55,7 @@ for.body: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index 1588d02..51255b2 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { +define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp26 = icmp sgt i32 %n, 0 @@ -106,11 +106,11 @@ if.end14: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } ; As above but with multiple variables set per block. -define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { +define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-LABEL: @multi_variable_if_nest( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp26 = icmp sgt i32 %n, 0 @@ -224,5 +224,5 @@ if.end14: br i1 %exitcond, label %for.end, label %for.body for.end: - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll index 8a7f4a3..a88a9b14 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll @@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; } ;} -define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { -; CHECK-LABEL: define i32 @function0( +define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { +; CHECK-LABEL: define void @function0( ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]] @@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) ; CHECK: [[FOR_END_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp16 = icmp slt i32 %start, %end @@ -127,7 +127,7 @@ if.end: br i1 %cmp, label %for.body, label %for.end for.end: - ret i32 undef + ret void } @@ -237,6 +237,8 @@ for.end: ; preds = %for.inc, %entry ; Handle PHI with single incoming value having a full mask. ; PR34523 +; NOTE: Changing PHI inputs from undef to poison leads to change in +; behaviour of the test. Left as undef for now. define void @PR34523() { ; CHECK-LABEL: define void @PR34523() { ; CHECK-NEXT: [[BB1:.*:]] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index 742ee64..eea2237 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -337,7 +337,7 @@ for.end: ; preds = %for.body ; } ; } -define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { +define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] ; CHECK: for.end10: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp20 = icmp eq i32 %itr, 0 @@ -469,12 +469,12 @@ for.inc8: ; preds = %for.body3, %for.con br i1 %exitcond26, label %for.end10, label %for.cond1.preheader for.end10: ; preds = %for.inc8, %entry - ret i32 undef + ret void } ; second uniform store to the same address is conditional. ; we do not vectorize this. -define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { +define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores_conditional( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] ; CHECK: for.end10: -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: ret void ; entry: %cmp20 = icmp eq i32 %itr, 0 @@ -567,7 +567,7 @@ for.inc8: ; preds = %for.body3, %for.con br i1 %exitcond26, label %for.end10, label %for.cond1.preheader for.end10: ; preds = %for.inc8, %entry - ret i32 undef + ret void } ; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll index b891b43..d9d9eec 100644 --- a/llvm/test/Transforms/LoopVectorize/memdep.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep.ll @@ -132,7 +132,7 @@ for.end: ; CHECK-LABEL: @f6 ; CHECK-NOT: <2 x i32> -define i32 @f6(ptr %a, i32 %tmp) { +define void @f6(ptr %a, i32 %tmp) { entry: br label %for.body @@ -149,7 +149,7 @@ for.body: br i1 %exitcond, label %for.body, label %for.end for.end: - ret i32 undef + ret void } ; Don't vectorize true loop carried dependencies that are not a multiple of the diff --git a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll new file mode 100644 index 0000000..ce07364 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +define void @call_loop_invariant_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_loop_invariant_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "deopt"(float 1.000000e+01) ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "deopt"(float 10.0) ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @call_unknown_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_unknown_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "unknown"(ptr null) ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "unknown"(ptr null) ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @call_cold_operand_bundle(ptr %dst, {float, float} %sv) { +; CHECK-LABEL: define void @call_cold_operand_bundle( +; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "cold"() ] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %a = extractvalue { float, float } %sv, 0 + %b = extractvalue { float, float } %sv, 1 + %addr = getelementptr float, ptr %dst, i32 %iv + %p = call float @llvm.pow.f32(float %a, float %b) [ "cold"() ] + store float %p, ptr %addr + %iv.next = add nsw i32 %iv, 1 + %cond = icmp ne i32 %iv.next, 1000 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @assume_loop_variant_operand_bundle(ptr noalias %a, ptr noalias %b) { +; CHECK-LABEL: define void @assume_loop_variant_operand_bundle( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP0]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP1]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP2]]) ] +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP3]]) ] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+02 + tail call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 %iv) ] + %add = fadd float %0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv + store float %add, ptr %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1599 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @assume_cold_operand_bundle(ptr noalias %a, ptr noalias %b) { +; CHECK-LABEL: define void @assume_cold_operand_bundle( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+02 + tail call void @llvm.assume(i1 true) [ "cold"() ] + %add = fadd float %0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv + store float %add, ptr %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1599 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll index d700d48..f5e480c 100644 --- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll +++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll @@ -10,7 +10,7 @@ ; CHECK: store i64 %indvars.outer, ptr %O2, align 4 -define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) { +define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) { entry: %cmp = icmp sgt i64 %n, 0 br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer @@ -50,5 +50,5 @@ for.end.outer.loopexit: ; preds = %for.end.inner br label %for.end.outer for.end.outer: ; preds = %for.end.outer.loopexit, %entry - ret i64 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll index ad7f6e7..0a9c8c1 100644 --- a/llvm/test/Transforms/LoopVectorize/pr28541.ll +++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll @@ -28,7 +28,7 @@ ; CHECK-NOT: vectorized loop ; CHECK-LABEL: fn1 -define i32 @fn1() { +define void @fn1() { entry: %tmp2 = load i32, ptr @b, align 4 %dec3 = add nsw i32 %tmp2, -1 @@ -67,5 +67,5 @@ while.cond.while.end_crit_edge: ; preds = %while.cond br label %while.end while.end: ; preds = %while.cond.while.end_crit_edge, %entry - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index f87be5a..6ea227f 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; a[i] = b[i] * 3; ; } -define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp { +define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]] @@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: ret i32 undef, !dbg [[DBG14]] +; CHECK-NEXT: ret void, !dbg [[DBG14]] ; ; FORCED_OPTSIZE-LABEL: @foo( ; FORCED_OPTSIZE-NEXT: entry: @@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; FORCED_OPTSIZE: for.end.loopexit: ; FORCED_OPTSIZE-NEXT: br label [[FOR_END]], !dbg [[DBG10:![0-9]+]] ; FORCED_OPTSIZE: for.end: -; FORCED_OPTSIZE-NEXT: ret i32 undef, !dbg [[DBG10]] +; FORCED_OPTSIZE-NEXT: ret void, !dbg [[DBG10]] ; entry: %cmp6 = icmp sgt i32 %n, 0, !dbg !6 @@ -99,7 +99,7 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.end, label %for.body, !dbg !7 for.end: ; preds = %for.body, %entry - ret i32 undef, !dbg !8 + ret void, !dbg !8 } ; Make sure that we try to vectorize loops with a runtime check if the @@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6 ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]] -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] -; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]] +; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: store i32 0, ptr [[IN]], align 4 ; CHECK-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index ad8cd42..667df3a 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -88,11 +88,11 @@ define void @test2(ptr %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP7]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 @@ -101,8 +101,6 @@ define void @test2(ptr %a, ptr noalias %b) { ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD3]], splat (float 1.000000e+00) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll index cc21b94..8df71e83 100644 --- a/llvm/test/Transforms/LoopVectorize/write-only.ll +++ b/llvm/test/Transforms/LoopVectorize/write-only.ll @@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @read_mod_write_single_ptr( ;CHECK: load <4 x float> -;CHECK: ret i32 -define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +;CHECK: ret void +define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } ; Ensure that volatile stores are not vectorized. ; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store( ; CHECK-NOT: store <4 x float> -; CHECK: ret i32 -define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp { +; CHECK: ret void +define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll index f1ffcc7..239397b 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll @@ -17,7 +17,7 @@ define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 { ; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_ -; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] { +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0 ; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]] |