diff options
Diffstat (limited to 'llvm/test/CodeGen')
59 files changed, 9164 insertions, 2019 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir index cc75774..c2bf95c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir @@ -15,8 +15,9 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[C1]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY2]], [[C2]] ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32)) ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64) @@ -91,7 +92,8 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32)) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C3]](s64) - ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[C3]], [[C2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[C3]](s64) + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[COPY3]], [[C2]] ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD1]](p0) :: (store (s32)) ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C3]](s64) diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll index 649d0a9..e7e9ee7 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll @@ -1,41 +1,54 @@ -; RUN: llc 
-mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: smmla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: smmla.v4i32.v16i8 -; CHECK: smmla v0.4s, v1.16b, v2.16b %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) ret <4 x i32> %vmmla1.i } define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: ummla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: ummla.v4i32.v16i8 -; CHECK: ummla v0.4s, v1.16b, v2.16b %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) ret <4 x i32> %vmmla1.i } define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usmmla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usmmla.v4i32.v16i8 -; CHECK: usmmla v0.4s, v1.16b, v2.16b %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 ret <4 x i32> %vusmmla1.i } define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot.v2i32.v8i8 -; CHECK: usdot v0.2s, v1.8b, v2.8b %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 
x i8> %a, <8 x i8> %b) ret <2 x i32> %vusdot1.i } define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot_lane.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_lane.v2i32.v8i8 -; CHECK: usdot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -44,9 +57,12 @@ entry: } define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sudot_lane.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v2i32.v8i8 -; CHECK: sudot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -55,9 +71,11 @@ entry: } define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot_lane.v2i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_lane.v2i32.v16i8 -; CHECK: usdot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -66,9 +84,11 @@ entry: } define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sudot_lane.v2i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v2i32.v16i8 -; CHECK: sudot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x 
i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -77,17 +97,22 @@ entry: } define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.16b %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 ret <4 x i32> %vusdot1.i } define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot_lane.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_lane.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -96,9 +121,12 @@ entry: } define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sudot_lane.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v4i32.v16i8 -; CHECK: sudot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -107,9 +135,11 @@ entry: } define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot_laneq.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_laneq.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> 
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -118,9 +148,11 @@ entry: } define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sudot_laneq.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_laneq.v4i32.v16i8 -; CHECK: sudot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -133,4 +165,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 - diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll index a0f1b71..bb362d2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines ; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines -define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) { +define void @zero_cycle_regmove_FPR64(double %a, double %b, double %c, double %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov d0, d2 @@ -45,7 +45,7 @@ entry: declare float @foo_double(double, double) -define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) { +define void 
@zero_cycle_regmove_FPR32(float %a, float %b, float %c, float %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov s0, s2 @@ -86,7 +86,7 @@ entry: declare float @foo_float(float, float) -define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) { +define void @zero_cycle_regmove_FPR16(half %a, half %b, half %c, half %d) { entry: ; CHECK-LABEL: t: ; NOZCM-FPR128-CPU: fmov s0, s2 diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll index e14e69b..d6d3f15 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines ; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines -define void @zero_cycle_regmov_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) { +define void @zero_cycle_regmove_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) { entry: ; CHECK-LABEL: t: ; NOTCPU-LINUX: mov w0, w2 diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll index 15ee6a0..36655f6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll @@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x ret float %r } +; No FMULV instruction so use knowledge about the architectural maximum size of +; an SVE register to "scalarise" the reduction. 
+ +define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) { +; CHECK-LABEL: fmulv_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a) + ret half %res +} + +define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) { +; CHECK-LABEL: fmulv_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a) + ret half %res +} + +define half 
@fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) { +; CHECK-LABEL: fmulv_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z3.h +; CHECK-NEXT: fmul h0, h0, h1 +; CHECK-NEXT: ret + %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a) + ret half %res +} + +define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) { +; CHECK-LABEL: fmulv_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: ret + %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a) + 
ret float %res +} + +define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) { +; CHECK-LABEL: fmulv_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.s, #1.00000000 +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z3.s +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: ret + %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a) + ret float %res +} + +define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) { +; CHECK-LABEL: fmulv_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z2.d, #1.00000000 +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z3.d +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: ret + %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a) + ret double %res +} + declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>) declare half 
@llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>) declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>) -declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>) -declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>) -declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>) declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>) declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>) declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>) @@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>) declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>) declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>) declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>) + +declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>) +declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>) +declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>) +declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>) +declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>) +declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index be936f0..6fb0315 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) { ret i64 %res } +; No MULV instruction so use knowledge about the architectural maximum size of +; an SVE register to "scalarise" the reduction. 
+ +define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) { +; CHECK-LABEL: mulv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a) + ret i8 %res +} + +define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) { +; CHECK-LABEL: mulv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, #1 // =0x1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, 
z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a) + ret i16 %res +} + +define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) { +; CHECK-LABEL: mulv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a) + ret i32 %res +} + +define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) { +; CHECK-LABEL: mulv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: uzp2 z2.d, 
z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a) + ret i64 %res +} + ; Test widen vector reduce type declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll index 26b9d99..8705647 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -206,7 +206,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 { ; global nnan function attribute always forces clamp combine -define float @test_min_max_global_nnan(float %a) #3 { +define float @test_min_max_global_nnan(float %a) { ; GFX10-LABEL: test_min_max_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -223,11 +223,11 @@ define float @test_min_max_global_nnan(float %a) #3 { ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0) - %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0) ret float %fmed } -define float @test_max_min_global_nnan(float %a) #3 { +define float @test_max_min_global_nnan(float %a) { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -244,7 +244,7 @@ define float @test_max_min_global_nnan(float %a) #3 { ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 1.0) - %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) + %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0) ret float %fmed } @@ -414,5 +414,4 @@ declare 
<2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) attributes #0 = {"amdgpu-ieee"="true"} attributes #1 = {"amdgpu-ieee"="false"} attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"} -attributes #3 = {"no-nans-fp-math"="true"} attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index d2c93e7..696a87b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -232,7 +232,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 { ; global nnan function attribute always forces fmed3 combine -define float @test_min_max_global_nnan(float %a) #2 { +define float @test_min_max_global_nnan(float %a) { ; GFX10-LABEL: test_min_max_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -254,12 +254,12 @@ define float @test_min_max_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) + %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0) %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0) ret float %fmed } -define float @test_max_min_global_nnan(float %a) #2 { +define float @test_max_min_global_nnan(float %a) { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -281,8 +281,8 @@ define float @test_max_min_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %minnum = call float @llvm.minnum.f32(float %a, float 4.0) - %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) + %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0) + %fmed = call nnan 
float @llvm.maxnum.f32(float %minnum, float 2.0) ret float %fmed } @@ -560,4 +560,3 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) attributes #0 = {"amdgpu-ieee"="true"} attributes #1 = {"amdgpu-ieee"="false"} -attributes #2 = {"no-nans-fp-math"="true"} diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 9e15225..3145a27 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -10,7 +10,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -342,7 +342,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -453,7 +453,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -569,7 +569,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ret void } -define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -740,7 +740,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ret void } -define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x9 @@ -955,14 +955,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid %a = load float, ptr addrspace(1) %gep0 - %max = call float @llvm.maxnum.f32(float %a, float 2.0) - %med = call float @llvm.minnum.f32(float %max, float 4.0) + %max = call nnan float @llvm.maxnum.f32(float %a, float 2.0) + %med = call nnan float @llvm.minnum.f32(float %max, float 4.0) store float %med, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1297,10 +1297,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1487,10 +1487,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %b.fneg = fsub float -0.0, %b - %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg) - %tmp1 = call float @llvm.maxnum.f32(float %a, 
float %b.fneg) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b.fneg) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b.fneg) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1677,10 +1677,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %c.fneg = fsub float -0.0, %c - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -1872,14 +1872,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %b.fabs = call float @llvm.fabs.f32(float %b) - %c.fabs = call float @llvm.fabs.f32(float %c) + %b.fabs = call nnan float @llvm.fabs.f32(float %b) + %c.fabs = call nnan float @llvm.fabs.f32(float %c) %c.fabs.fneg = fsub float -0.0, %c.fabs - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float 
@llvm.minnum.f32(float %a.fneg, float %b.fabs) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void @@ -2082,16 +2082,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs %c.fabs = call float @llvm.fabs.f32(float %c) %c.fabs.fneg = fsub float -0.0, %c.fabs - %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, 
ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2418,7 +2418,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ret void } -define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2570,7 +2570,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ret void } -define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2878,10 +2878,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr 
addrspace(1) %outgep ret void } @@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3220,10 +3220,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3372,10 +3372,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float 
@llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3524,10 +3524,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3676,10 +3676,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float 
@llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3828,10 +3828,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -3980,10 +3980,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4132,10 +4132,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float 
@llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4284,10 +4284,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4436,10 +4436,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float 
@llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4588,10 +4588,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4740,10 +4740,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -4892,10 +4892,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load 
volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5044,10 +5044,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5196,10 +5196,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float 
@llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5348,10 +5348,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %b, float %a) - %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) - %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) - %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5503,10 +5503,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp1 = call float @llvm.minnum.f32(float %a, float %b) - %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -5515,7 +5515,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; Negative patterns ; 
--------------------------------------------------------------------- -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -5717,7 +5717,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -5944,7 +5944,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6146,7 +6146,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) 
%out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6352,7 +6352,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ret void } -define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6527,7 +6527,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6702,7 +6702,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ret void } -define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -6877,7 +6877,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) 
ret void } -define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -7270,10 +7270,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %med3, ptr addrspace(1) %outgep ret void } @@ -7428,13 +7428,13 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out %a = load volatile float, ptr addrspace(1) %gep0 %b = load volatile float, ptr addrspace(1) %gep1 %c = load volatile float, ptr addrspace(1) %gep2 - %max = call float @llvm.maxnum.f32(float %a, float %b) - %minmax = call float @llvm.minnum.f32(float %max, float %c) + %max = call nnan float @llvm.maxnum.f32(float %a, float %b) + %minmax = call nnan float @llvm.minnum.f32(float %max, float %c) store float %minmax, ptr addrspace(1) %outgep ret void } -define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void 
@v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -7597,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ret void } -define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -7865,7 +7865,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ret void } -define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -7998,7 +7998,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad } ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. 
-define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -8137,7 +8137,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ret void } -define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -8343,7 +8343,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ret void } -define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { +define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) { ; SI-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8384,7 +8384,7 @@ define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { ret float %med } -define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) #1 { +define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8452,7 +8452,7 @@ define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> % ret <2 x float> %med } -define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) #1 { +define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use: ; SI-SDAG: ; 
%bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8525,7 +8525,7 @@ define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use( ret { float, float } %ins.1 } -define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8567,7 +8567,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8609,7 +8609,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8651,7 +8651,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 { ret float %med } -define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 { +define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8693,7 +8693,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 { ret float %med } -define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { +define half 
@v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8772,7 +8772,7 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { ret half %med } -define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) #1 { +define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8848,7 +8848,7 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ret <2 x half> %med } -define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) #1 { +define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) { ; SI-LABEL: v_test_fmed3_r_i_i_f64_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8905,5 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0 declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 6b09424..eee232a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -49,7 +49,6 @@ bb: ret void } -; FIXME: This generates "instid1(/* invalid instid value */)". 
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index b07dec3..689d147 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,1153 +6,1147 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $vgpr8 - ; GCN-NEXT: ; implicit-def: $vgpr94 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr106 - ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr133 - ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr25 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_readfirstlane_b32 s17, v16 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; 
GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s18, s17, 7 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 + ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 + ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 + ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 + ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 + ; GCN-NEXT: ; implicit-def: $vgpr213 + ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr246 + ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr26 + ; GCN-NEXT: ; implicit-def: $vgpr27 + ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 + ; GCN-NEXT: v_add_u32_e32 v231, v19, v26 + ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr28 + ; GCN-NEXT: ; implicit-def: $vgpr29 + ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 + ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; 
implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 + ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: ; implicit-def: $vgpr24 + ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 + ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 + ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 + ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr196 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr211 + ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 + ; GCN-NEXT: ; implicit-def: $vgpr198 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 + ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 + ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; 
GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ; implicit-def: $vgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr30 + ; GCN-NEXT: ; implicit-def: $vgpr31 + ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 + ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 + ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 + ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 + ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 + ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v230, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], 
v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[188:191], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 
v[168:171], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; 
GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] + ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] + ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 + ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 + ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 + ; GCN-NEXT: ds_read_b128 v[160:163], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 + ; GCN-NEXT: v_perm_b32 v241, 
v174, v172, s7 + ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] + ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] + ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[184:187], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] + ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v230, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v213 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: ds_read_b128 v[230:233], v246 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: 
v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; 
implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v199, v[238:239] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v200, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v201, v[242:243] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v202, v[244:245] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] + ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: 
v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] + ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 + ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 + ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 
+ ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 + ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 + ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 + ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 + ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95] + ; GCN-NEXT: 
v_mul_f32_e32 v219, s4, v117 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 + ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 
v214, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 + ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 + ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95 + ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: 
v_mul_f32_e32 v129, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_mul_f32_e32 
v144, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_exp_f32_e32 v114, v138 + ; GCN-NEXT: v_exp_f32_e32 v115, v139 + ; GCN-NEXT: v_exp_f32_e32 v116, v140 + ; GCN-NEXT: v_exp_f32_e32 v117, v141 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 + ; GCN-NEXT: v_exp_f32_e32 v118, v142 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 + ; GCN-NEXT: v_exp_f32_e32 v120, v144 + ; GCN-NEXT: v_exp_f32_e32 v113, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 + ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 + ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 + ; GCN-NEXT: v_exp_f32_e32 v112, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; 
GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 + ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 + ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 + ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 + ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112 + ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: 
v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 + ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 + ; GCN-NEXT: v_exp_f32_e32 v121, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v149 + ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v123, v150 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v124, v151 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 + ; GCN-NEXT: v_exp_f32_e32 v96, v129 + ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v97, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 + ; GCN-NEXT: 
v_mul_f32_e32 v142, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 + ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 + ; GCN-NEXT: v_exp_f32_e32 v98, v138 + ; GCN-NEXT: v_exp_f32_e32 v99, v127 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 + ; GCN-NEXT: v_exp_f32_e32 v100, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v199, v[188:189] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v200, v[190:191] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; 
implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v201, v[192:193] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v202, v[194:195] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v102, v142 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: 
s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], 
v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 + ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 + ; GCN-NEXT: v_exp_f32_e32 v103, v150 + ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 + ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 + ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 + ; GCN-NEXT: v_exp_f32_e32 v104, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 + ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 + ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 + ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 + ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v106, v156 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 + ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v107, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 + ; GCN-NEXT: v_exp_f32_e32 v108, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] + ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 + ; GCN-NEXT: v_exp_f32_e32 v109, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], 
v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 + ; GCN-NEXT: v_exp_f32_e32 v110, v156 + ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v111, v146 + ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v80, v129 + ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] + ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 + ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 + ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 + ; GCN-NEXT: v_exp_f32_e32 v82, v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 + ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v83, v135 + ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 + ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv 
sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v199, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v200, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v201, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: ds_write_b64 v202, v[154:155] + ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 + ; GCN-NEXT: v_exp_f32_e32 v84, v129 + ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v85, v125 + ; GCN-NEXT: v_mul_f32_e32 
v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] + ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 + ; GCN-NEXT: v_exp_f32_e32 v86, v156 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 + ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 + ; GCN-NEXT: v_exp_f32_e32 v87, v157 + ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 + ; GCN-NEXT: v_fma_f32 
v138, s4, v89, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 + ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 + ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 + ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 + ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 + ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 + ; GCN-NEXT: v_exp_f32_e32 v88, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 + ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 + ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 + ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 + ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 + ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v89, v125 + ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 + ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v90, v158 + ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 + ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 + ; GCN-NEXT: v_exp_f32_e32 v91, v139 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 
offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 + ; GCN-NEXT: v_exp_f32_e32 v92, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 + ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 + ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v93, v125 + ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v94, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v95, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v125, v129 + ; GCN-NEXT: ds_read_b128 v[132:135], v197 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 + ; GCN-NEXT: v_exp_f32_e32 v126, v142 + ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 
+ ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 + ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v127, v143 + ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; 
GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, 
v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v199, v[150:151] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v200, v[152:153] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 + ; GCN-NEXT: v_exp_f32_e32 v130, v158 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v201, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v202, v[156:157] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - 
; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: 
v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: 
v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 + ; GCN-NEXT: v_exp_f32_e32 v131, v144 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 + ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 + ; GCN-NEXT: v_exp_f32_e32 v132, v145 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v133, v141 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[68:71], v198 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v72, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 + ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v144 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_exp_f32_e32 v74, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] + ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 + ; GCN-NEXT: v_exp_f32_e32 v75, v142 + ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], 
v198 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 + ; GCN-NEXT: v_exp_f32_e32 v76, v146 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 + ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v77, v147 + ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 + ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v78, v67 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_exp_f32_e32 v79, v148 + ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v142, v146 + ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v137, v147 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v138, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] + ; GCN-NEXT: 
s_nop 10 + ; GCN-NEXT: v_exp_f32_e32 v52, v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 + ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 + ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 + ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 + ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 + ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v116, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v117, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v118, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v119, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v120, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v121, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v122, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v123, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v124, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v96, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v97, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v98, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v99, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v100, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v89, 
v53 + ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 + ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 + ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 + ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 + ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - 
; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 + ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 7959cee..e174fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; 
GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 
a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index aa099b6..b65a1a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 
offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 
a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 
a[64:95], v2, v1, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 12 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: s_nop 11 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: 
ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: 
v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; 
EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 12 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; EXACTCUTOFF-NEXT: s_nop 11 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 56f9c5d..d578d2e 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -612,10 +612,10 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) ; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call float @llvm.minnum.f32(float %x, float %y) - %tmp1 = call float @llvm.maxnum.f32(float %x, float %y) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z) - %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minnum.f32(float %x, float %y) + %tmp1 = call nnan float @llvm.maxnum.f32(float %x, float %y) + %tmp2 = 
call nnan float @llvm.minnum.f32(float %tmp1, float %z) + %tmp3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) store float %tmp3, ptr addrspace(1) %arg ret void } @@ -646,10 +646,10 @@ define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x ; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y) - %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y) - %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z) - %tmp3 = call float @llvm.maximumnum.f32(float %tmp0, float %tmp2) + %tmp0 = call nnan float @llvm.minimumnum.f32(float %x, float %y) + %tmp1 = call nnan float @llvm.maximumnum.f32(float %x, float %y) + %tmp2 = call nnan float @llvm.minimumnum.f32(float %tmp1, float %z) + %tmp3 = call nnan float @llvm.maximumnum.f32(float %tmp0, float %tmp2) store float %tmp3, ptr addrspace(1) %arg ret void } @@ -1280,10 +1280,10 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GISEL-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4 ; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] - %tmp0 = call half @llvm.minnum.f16(half %x, half %y) - %tmp1 = call half @llvm.maxnum.f16(half %x, half %y) - %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z) - %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2) + %tmp0 = call nnan half @llvm.minnum.f16(half %x, half %y) + %tmp1 = call nnan half @llvm.maxnum.f16(half %x, half %y) + %tmp2 = call nnan half @llvm.minnum.f16(half %tmp1, half %z) + %tmp3 = call nnan half @llvm.maxnum.f16(half %tmp0, half %tmp2) store half %tmp3, ptr addrspace(1) %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll index ddbae64..a95d8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll +++ 
b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll @@ -1,8 +1,8 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100 declare i64 @llvm.readsteadycounter() #0 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788..8803f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -367,77 +367,76 @@ bb: define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-LABEL: illegal_mfma_after_rewrite: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[16:19] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; 
CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3] ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v9, v8 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v8 +; CHECK-NEXT: 
v_cvt_f16_f32_e32 v2, v6 +; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: global_store_short v[0:1], v10, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 -; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CHECK-NEXT: global_store_short v[0:1], v6, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: global_store_short v[0:1], v24, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] ; CHECK-NEXT: 
s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] +; CHECK-NEXT: global_store_short v[0:1], v6, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: global_store_short v[12:13], v0, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"() @@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 { ; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class: ; CHECK: ; %bb.0: +; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v63, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v62, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v61, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v60, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v59, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v58, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v57, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v56, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v55, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v54, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v53, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v52, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v51, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v50, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v49, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v48, a16 -; CHECK-NEXT: 
v_accvgpr_read_b32 v47, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v46, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v45, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v44, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v43, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v42, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v41, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v40, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v39, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v38, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v37, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v36, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v35, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v34, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0 -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v33 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v34 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v35 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v36 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v37 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v38 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v39 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v40 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v41 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v42 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v43 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v44 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v45 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v46 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v47 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v48 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v49 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v50 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v51 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v52 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v53 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v54 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v55 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v56 -; CHECK-NEXT: 
v_accvgpr_write_b32 a25, v57 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v58 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v59 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v60 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 -; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] -; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; 
CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31] +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] +; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43] +; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45] +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47] +; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49] +; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51] +; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 +; CHECK-NEXT: 
global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: s_nop 15 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir new file mode 100644 index 0000000..33b2f69 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir @@ -0,0 +1,32 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check that cycle counts are consistent with hazards. 
+ +# CHECK: Cycle: 3 TopQ.A +# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c +# CHECK-NOT: Cycle: 9 TopQ.A +# CHECK: Cycle: 83 TopQ.A +# CHECK: Checking pending node SU(6) +# CHECK: Move SU(6) into Available Q + +--- +name: pending_queue_ready_cycle +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + %2:sgpr_128 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %18:areg_512 = IMPLICIT_DEF + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec + %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec + undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec + %7:vreg_512 = COPY %18 + SCHED_BARRIER 0 + S_NOP 0, implicit %18, implicit %7, implicit %84 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll index a2d6ca9..972a470 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%progbits,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll index bf5249e..ec8d5b8 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -1,8 +1,8 @@ ;; Test if temporary labels are generated for each indirect callsite. 
-;; Test if the .callgraph section contains the MD5 hash of callees' type (type id) +;; Test if the .llvm.callgraph section contains the MD5 hash of callees' type (type id) ;; is correctly paired with its corresponding temporary label generated for indirect ;; call sites annotated with !callee_type metadata. -;; Test if the .callgraph section contains unique direct callees. +;; Test if the .llvm.callgraph section contains unique direct callees. ; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%progbits,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll index d577603..8036004 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll @@ -1,7 +1,7 @@ -;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls. +;; Tests that we store the type identifiers in .llvm.callgraph section of the object file for tailcalls. ; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ -; RUN: llvm-readelf -x .callgraph - | FileCheck %s +; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { entry: @@ -27,7 +27,7 @@ declare !type !2 i32 @bar(i8 signext) !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{i64 0, !"_ZTSFiiE.generalized"} -; CHECK: Hex dump of section '.callgraph': +; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154 ; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8 ;; Verify that the type id 0x308e4b8159bc8654 is in section. 
diff --git a/llvm/test/CodeGen/ARM/call-graph-section.ll b/llvm/test/CodeGen/ARM/call-graph-section.ll index 928a1067..167cc6f 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section.ll @@ -1,7 +1,7 @@ -;; Tests that we store the type identifiers in .callgraph section of the object file. +;; Tests that we store the type identifiers in .llvm.callgraph section of the object file. ; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ -; RUN: llvm-readelf -x .callgraph - | FileCheck %s +; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s declare !type !0 void @foo() @@ -31,7 +31,7 @@ entry: ;; Make sure following type IDs are in call graph section ;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 -; CHECK: Hex dump of section '.callgraph': +; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000324 ; CHECK-NEXT: 0x00000010 44f731f5 eecb3e54 86bc5981 4b8e307a ; CHECK-NEXT: 0x00000020 de6814f8 97fd77 diff --git a/llvm/test/CodeGen/ARM/nnan-fsub.ll b/llvm/test/CodeGen/ARM/nnan-fsub.ll index 0183908..78dd36f 100644 --- a/llvm/test/CodeGen/ARM/nnan-fsub.ll +++ b/llvm/test/CodeGen/ARM/nnan-fsub.ll @@ -1,18 +1,22 @@ -; RUN: llc -mcpu=cortex-a9 < %s | FileCheck -check-prefix=SAFE %s -; RUN: llc -mcpu=cortex-a9 --enable-no-nans-fp-math < %s | FileCheck -check-prefix=FAST %s +; RUN: llc -mcpu=cortex-a9 < %s | FileCheck %s target triple = "armv7-apple-ios" -; SAFE: test -; FAST: test +; CHECK-LABEL: test define float @test(float %x, float %y) { entry: -; SAFE: vmul.f32 -; SAFE: vsub.f32 -; FAST: mov r0, #0 +; CHECK: vmul.f32 +; CHECK-NEXT: vsub.f32 %0 = fmul float %x, %y %1 = fsub float %0, %0 ret float %1 } - +; CHECK-LABEL: test_nnan +define float @test_nnan(float %x, float %y) { +entry: +; CHECK: mov r0, #0 + %0 = fmul float %x, %y + %1 = fsub nnan float %0, %0 + ret float %1 +} diff --git 
a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll index a78fdd5..f1486f97 100644 --- a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll @@ -74,7 +74,7 @@ entry: ; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1 ; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2 ; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY:%.*]], i32 0 -; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32 +; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 8 ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5) ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 @@ -83,9 +83,9 @@ entry: ; CHECK: [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 ; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1 ; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2 -; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 32 -; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32 - call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 64, i1 false) +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 24 +; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 8 + call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 48, i1 false) ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 7) diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll 
b/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll index 7ba2ed2..f1d28e2 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll @@ -19,11 +19,11 @@ target triple = "dxil-pc-shadermodel6.6-compute" ; PRINT:; Resource Bindings: ; PRINT-NEXT:; -; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count -; PRINT-NEXT:; ------------------------------ ---------- ------- ----------- ------- -------------- ------ -; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1 -; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1 -; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1 +; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count +; PRINT-NEXT:; ---- +; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1 +; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1 +; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1 define void @test() #0 { diff --git a/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll new file mode 100644 index 0000000..ff03bf1 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll @@ -0,0 +1,16 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +define i32 @test_getdimensions_no_mips() { + ; CHECK: %[[HANDLE:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, + ; CHECK-NEXT: %[[ANNOT_HANDLE:.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[HANDLE]] + %handle = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + + ; CHECK-NEXT: %[[RETVAL:.*]] = call %dx.types.Dimensions @dx.op.getDimensions(i32 72, %dx.types.Handle %[[ANNOT_HANDLE]], i32 undef) + ; CHECK-NEXT: %[[DIM:.*]] = extractvalue %dx.types.Dimensions %[[RETVAL]], 0 + %1 = call i32 @llvm.dx.resource.getdimensions.x(target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %handle) + + ; CHECK-NEXT: ret i32 %[[DIM]] + ret i32 %1 +} diff 
--git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir new file mode 100644 index 0000000..bf14dcf --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir @@ -0,0 +1,88 @@ +# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s +# REQUIRES: asserts + +# This loop has six stores, which exceeds the limit set by +# `pipeliner-max-num-stores`. + +# CHECK: Too many stores + +--- | + target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + target triple = "hexagon-unknown-linux-musl" + + define void @f(ptr %a, i32 %n) #0 { + entry: + %guard = icmp sgt i32 %n, 0 + %btc = sub nsw i32 %n, 1 + br i1 %guard, label %loop.preheader, label %exit + + loop.preheader: ; preds = %entry + %0 = add i32 %n, 1 + %cgep = getelementptr i8, ptr %a, i32 %0 + br label %loop + + loop: ; preds = %loop.preheader, %loop + %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ] + %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ] + %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2 + store i8 0, ptr %cgep7, align 1 + %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1 + store i8 1, ptr %cgep8, align 1 + store i8 2, ptr %lsr.iv, align 1 + %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1 + store i8 3, ptr %cgep9, align 1 + %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2 + store i8 4, ptr %cgep10, align 1 + %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3 + store i8 5, ptr %cgep11, align 1 + %i.dec = sub i32 %i, 1 + %ec = icmp eq i32 %i.dec, 0 + br i1 %ec, label %exit, label %loop + + exit: ; preds = %loop, %entry + ret void + } + + attributes #0 = { "target-cpu"="hexagonv79" } +... 
+--- +name: f +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $r0, $r1 + + %7:intregs = COPY $r1 + %6:intregs = COPY $r0 + %8:predregs = C2_cmpgti %7, 0 + J2_jumpf %8, %bb.3, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop.preheader: + successors: %bb.2(0x80000000) + + %0:intregs = A2_addi %7, -1 + %1:intregs = S4_addaddi %7, %6, 1 + %10:intregs = A2_tfrsi 0 + %11:intregs = A2_tfrsi 1 + %14:intregs = COPY %0 + J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.2.loop (machine-block-address-taken): + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:intregs = PHI %1, %bb.1, %4, %bb.2 + S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7) + %4:intregs = A2_addi %2, -1 + S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8) + S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv) + S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9) + S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10) + S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11) + ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def dead $pc + + bb.3.exit: + PS_jmpret $r31, implicit-def dead $pc +... 
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll new file mode 100644 index 0000000..e67d031 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation. + +; Function for the vector type v2i64 `a + {1, 1}` +define <2 x i64> @test_v2i64(<2 x i64> %a) { +; CHECK-LABEL: test_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: vupklsw v3, v3 +; CHECK-NEXT: vaddudm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <2 x i64> %a, splat (i64 1) + ret <2 x i64> %add +} + +; Function for the vector type v4i32 `a + {1, 1, 1, 1}` +define <4 x i32> @test_v4i32(<4 x i32> %a) { +; CHECK-LABEL: test_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: vadduwm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <4 x i32> %a, splat (i32 1) + ret <4 x i32> %add +} + +; Function for the vector type v8i16 `a + {1, 1, 1, 1, 1, 1, 1, 1}` +define <8 x i16> @test_v8i16(<8 x i16> %a) { +; CHECK-LABEL: test_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltish v3, 1 +; CHECK-NEXT: vadduhm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <8 x i16> %a, splat (i16 1) + ret <8 x i16> %add +} + +; Function for the vector type v16i8 `a + {1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}` +define <16 x i8> @test_16i8(<16 x i8> %a) { +; CHECK-LABEL: test_16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v3, 1 +; CHECK-NEXT: vaddubm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <16 x i8> %a, splat (i8 1) + ret <16 x i8> %add +} diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll deleted file mode 100644 index e4c93adc..0000000 --- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll +++ /dev/null @@ -1,23 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ -; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s - -; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation. -; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s -; followed by subtraction operation. -define dso_local noundef <4 x i32> @test1(<4 x i32> %a) { -; CHECK-LABEL: test1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vspltisw v3, 1 -; CHECK-NEXT: vadduwm v2, v2, v3 -; CHECK-NEXT: blr -entry: - %add = add <4 x i32> %a, splat (i32 1) - ret <4 x i32> %add -} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir index 2e500d5..da7546e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -689,8 +689,8 @@ # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir new file mode 100644 index 0000000..d7c0e80 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir @@ -0,0 +1,1742 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: insertelement_nxv1i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 1 x s1>) = 
G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv1i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv1i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv1i1_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %2:_(s32) = COPY $x10 + %0:_(s1) = G_TRUNC %2(s32) + %1:_(s32) = COPY $x11 + %4:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %3:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32) + $v0 = COPY %3(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv2i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s32) = G_CONSTANT i32 1 + %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv2i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv2i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv2i1_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %2:_(s32) = COPY $x10 + %0:_(s1) = G_TRUNC %2(s32) + %1:_(s32) = COPY $x11 + %4:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %3:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32) + $v0 = COPY %3(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv4i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s32) = G_CONSTANT i32 2 + %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv4i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv4i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv4i1_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(s32) = COPY $x10 + %0:_(s1) = G_TRUNC %1(s32) + %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %4:_(s32) = G_CONSTANT i32 0 + %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s32) + $v0 = COPY %2(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv8i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 8 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv8i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 8 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv8i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv8i1_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %2:_(s32) = COPY $x10 + %0:_(s1) = G_TRUNC %2(s32) + %1:_(s32) = COPY $x11 + %4:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + %3:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32) + $v0 = COPY %3(<vscale x 8 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv16i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv16i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s32) = G_CONSTANT i32 15 + %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32) + $v0 = COPY %0(<vscale x 16 x s1>) + PseudoRET implicit $v0 +... 
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv16i1_1: inserts `i1 true` at constant index 0 into a
+# G_IMPLICIT_DEF <vscale x 16 x s1> vector and returns it in $v0.
+# Expected: the s1 constant is rebuilt as `G_CONSTANT i32 1` + G_TRUNC to s1.
+---
+name: insertelement_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+# insertelement_nxv16i1_2: element is $x10 truncated s32 -> s1, index is the
+# s32 copy of $x11, destination is a G_IMPLICIT_DEF <vscale x 16 x s1> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s32)
+    %1:_(s32) = COPY $x11
+    %4:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+    $v0 = COPY %3(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv4i1_3: destination vector is the live-in $v0
+# (<vscale x 4 x s1>), element is $x10 truncated s32 -> s1, index is constant 0.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv4i1_3
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v0, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_3
+    ; CHECK: liveins: $v0, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %0:_(<vscale x 4 x s1>) = COPY $v0
+    %2:_(s32) = COPY $x10
+    %1:_(s1) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s32)
+    $v0 = COPY %3(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+# insertelement_nxv1i8_0: inserts the constant `i8 0` at constant index 0 into
+# a G_IMPLICIT_DEF <vscale x 1 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s8.
+---
+name: insertelement_nxv1i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv1i8_1: inserts the constant `i8 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 1 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s8.
+---
+name: insertelement_nxv1i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv1i8_2: element is $x10 truncated s32 -> s8, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 1 x s8> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv1i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv2i8_0: inserts the constant `i8 0` at constant index 0 into
+# a G_IMPLICIT_DEF <vscale x 2 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s8.
+---
+name: insertelement_nxv2i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv2i8_1: same as the _0 variant but inserts `i8 -1`.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s8.
+---
+name: insertelement_nxv2i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv2i8_2: element is $x10 truncated s32 -> s8, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 2 x s8> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv2i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv4i8_0: inserts the constant `i8 0` at constant index 0 into
+# a G_IMPLICIT_DEF <vscale x 4 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s8.
+---
+name: insertelement_nxv4i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv4i8_1: inserts the constant `i8 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 4 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s8.
+---
+name: insertelement_nxv4i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv4i8_2: element is $x10 truncated s32 -> s8, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 4 x s8> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv4i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv8i8_0: inserts the constant `i8 0` at constant index 0 into
+# a G_IMPLICIT_DEF <vscale x 8 x s8> vector and returns it in $v8.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s8.
+---
+name: insertelement_nxv8i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv8i8_1: same as the _0 variant but inserts `i8 -1`.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s8.
+---
+name: insertelement_nxv8i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv8i8_2: element is $x10 truncated s32 -> s8, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 8 x s8> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv8i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv16i8_0: inserts the constant `i8 0` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 16 x s8> vector and returns it in $v8m2.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s8.
+---
+name: insertelement_nxv16i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv16i8_1: inserts the constant `i8 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 16 x s8> vector and returns it in $v8m2.
+# Expected: the s8 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s8.
+---
+name: insertelement_nxv16i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+# insertelement_nxv16i8_2: element is $x10 truncated s32 -> s8. The input
+# builds an s64 index with G_MERGE_VALUES of $x11 and $x12 and truncates it
+# back to s32. Expected: the index operand is the plain s32 copy of $x11, with
+# no G_MERGE_VALUES, no index G_TRUNC and no copy of $x12 in the output.
+---
+name: insertelement_nxv16i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10, $x11, $x12
+
+    ; CHECK-LABEL: name: insertelement_nxv16i8_2
+    ; CHECK: liveins: $x10, $x11, $x12
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %2:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $x11
+    %4:_(s32) = COPY $x12
+    %1:_(s64) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %7:_(s32) = G_TRUNC %1(s64)
+    %5:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %6, %0(s8), %7(s32)
+    $v8m2 = COPY %5(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv4i8_3: destination vector is the live-in $v8
+# (<vscale x 4 x s8>), element is $x10 truncated s32 -> s8, index is constant 0.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv4i8_3
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_3
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s8>) = COPY $v8
+    %2:_(s32) = COPY $x10
+    %1:_(s8) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s32)
+    $v8 = COPY %3(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv1i16_0: inserts the constant `i16 0` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 1 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s16.
+---
+name: insertelement_nxv1i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv1i16_1: inserts the constant `i16 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 1 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s16.
+---
+name: insertelement_nxv1i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv1i16_2: element is $x10 truncated s32 -> s16, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 1 x s16> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv1i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv2i16_0: inserts the constant `i16 0` at constant index 1
+# into a G_IMPLICIT_DEF <vscale x 2 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s16.
+---
+name: insertelement_nxv2i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 1
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv2i16_1: inserts the constant `i16 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 2 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s16.
+---
+name: insertelement_nxv2i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv2i16_2: element is $x10 truncated s32 -> s16, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 2 x s16> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv2i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv4i16_0: inserts the constant `i16 0` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 4 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s16.
+---
+name: insertelement_nxv4i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv4i16_1: inserts the constant `i16 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 4 x s16> vector and returns it in $v8.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s16.
+---
+name: insertelement_nxv4i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv4i16_2: element is $x10 truncated s32 -> s16, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 4 x s16> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv4i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv8i16_0: inserts the constant `i16 0` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 8 x s16> vector and returns it in $v8m2.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s16.
+---
+name: insertelement_nxv8i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+# insertelement_nxv8i16_1: same as the _0 variant but inserts `i16 -1`.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s16.
+---
+name: insertelement_nxv8i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv8i16_2: element is $x10 truncated s32 -> s16, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 8 x s16> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv8i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8m2 = COPY %2(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+# insertelement_nxv16i16_0: inserts the constant `i16 0` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 16 x s16> vector and returns it in $v8m4.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 0` + G_TRUNC to s16.
+---
+name: insertelement_nxv16i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv16i16_1: inserts the constant `i16 -1` at constant index 0
+# into a G_IMPLICIT_DEF <vscale x 16 x s16> vector and returns it in $v8m4.
+# Expected: the s16 constant is rebuilt as `G_CONSTANT i32 -1` + G_TRUNC to s16.
+---
+name: insertelement_nxv16i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+# insertelement_nxv16i16_2: element is $x10 truncated s32 -> s16, index is
+# constant 0, destination is a G_IMPLICIT_DEF <vscale x 16 x s16> vector.
+# Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv16i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8m4 = COPY %2(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv4i16: destination vector is the live-in $v8
+# (<vscale x 4 x s16>), element is $x10 truncated s32 -> s16, index is
+# constant 0. Expected: the same instruction sequence as the input.
+---
+name: insertelement_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s16>) = COPY $v8
+    %2:_(s32) = COPY $x10
+    %1:_(s16) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s32)
+    $v8 = COPY %3(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv1i32_0: a single `G_CONSTANT i32 0` (%2) is used as both
+# the inserted element and the index into a G_IMPLICIT_DEF
+# <vscale x 1 x s32> vector. Expected: unchanged, with [[C]] used twice.
+---
+name: insertelement_nxv1i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+# NOTE(review): the RUN line is not visible in this chunk; the CHECK lines
+# presumably describe legalizer output (input is `legalized: false`) - confirm.
+#
+# insertelement_nxv1i32_1: inserts `i32 -1` at constant index 0 into a
+# G_IMPLICIT_DEF <vscale x 1 x s32> vector and returns it in $v8.
+# Expected: the same instruction sequence as the input (no G_TRUNC needed).
+---
+name: insertelement_nxv1i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+# insertelement_nxv1i32_2: element is the s32 copy of $x10 (used directly,
+# no truncation), index is constant 0, destination is a G_IMPLICIT_DEF
+# <vscale x 1 x s32> vector. Expected: the same sequence as the input.
+---
+name: insertelement_nxv1i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8 = COPY %1(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+--- +name: insertelement_nxv2i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32) + $v8 = COPY %0(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv2i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32) + $v8 = COPY %0(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv2i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv2i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_(s32) = COPY $x10 + %2:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %3:_(s32) = G_CONSTANT i32 0 + %1:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + $v8 = COPY %1(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv4i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32) + $v8m2 = COPY %0(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv4i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32) + $v8m2 = COPY %0(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv4i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv4i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %0:_(s32) = COPY $x10 + %2:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %3:_(s32) = G_CONSTANT i32 0 + %1:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + $v8m2 = COPY %1(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv8i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32) + $v8m4 = COPY %0(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv8i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32) + $v8m4 = COPY %0(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv8i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv8i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %0:_(s32) = COPY $x10 + %2:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %3:_(s32) = G_CONSTANT i32 0 + %1:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + $v8m4 = COPY %1(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv16i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv16i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32) + $v8m8 = COPY %0(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv16i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv16i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32) + $v8m8 = COPY %0(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... +--- +name: insertelement_nxv16i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv16i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %0:_(s32) = COPY $x10 + %2:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %3:_(s32) = G_CONSTANT i32 0 + %1:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32) + $v8m8 = COPY %1(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv4i32 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $v8m2 + + ; CHECK-LABEL: name: insertelement_nxv4i32 + ; CHECK: liveins: $x10, $v8m2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[C]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %0:_(<vscale x 4 x s32>) = COPY $v8m2 + %1:_(s32) = COPY $x10 + %3:_(s32) = G_CONSTANT i32 0 + %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %3(s32) + $v8m2 = COPY %2(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv1i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8 = COPY %0(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv1i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8 = COPY %0(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv1i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv1i64_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(s32) = COPY $x10 + %2:_(s32) = COPY $x11 + %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %5:_(s32) = G_CONSTANT i32 0 + %3:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32) + $v8 = COPY %3(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv2i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m2 = COPY %0(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv2i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m2 = COPY %0(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv2i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv2i64_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(s32) = COPY $x10 + %2:_(s32) = COPY $x11 + %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %5:_(s32) = G_CONSTANT i32 0 + %3:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32) + $v8m2 = COPY %3(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv4i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m4 = COPY %0(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv4i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m4 = COPY %0(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv4i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv4i64_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(s32) = COPY $x10 + %2:_(s32) = COPY $x11 + %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %5:_(s32) = G_CONSTANT i32 0 + %3:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32) + $v8m4 = COPY %3(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv8i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m8 = COPY %0(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... +--- +name: insertelement_nxv8i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s32) = G_CONSTANT i32 0 + %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32) + $v8m8 = COPY %0(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv8i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv8i64_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(s32) = COPY $x10 + %2:_(s32) = COPY $x11 + %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %5:_(s32) = G_CONSTANT i32 0 + %3:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32) + $v8m8 = COPY %3(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv4i64 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11, $v8m4 + + ; CHECK-LABEL: name: insertelement_nxv4i64 + ; CHECK: liveins: $x10, $x11, $v8m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x11 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[MV]](s64), [[C]](s32) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %0:_(<vscale x 4 x s64>) = COPY $v8m4 + %2:_(s32) = COPY $x10 + %3:_(s32) = COPY $x11 + %1:_(s64) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 0 + %4:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %0, %1(s64), %5(s32) + $v8m4 = COPY %4(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir new file mode 100644 index 0000000..4c33ddc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir @@ -0,0 +1,1731 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: insertelement_nxv1i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv1i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv1i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv1i1_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %2:_(s64) = COPY $x10 + %0:_(s1) = G_TRUNC %2(s64) + %3:_(s64) = COPY $x11 + %1:_(s32) = G_TRUNC %3(s64) + %5:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF + %6:_(s64) = G_ZEXT %1(s32) + %4:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64) + $v0 = COPY %4(<vscale x 1 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv2i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s64) = G_CONSTANT i64 1 + %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv2i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv2i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: insertelement_nxv2i1_2 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %2:_(s64) = COPY $x10 + %0:_(s1) = G_TRUNC %2(s64) + %3:_(s64) = COPY $x11 + %1:_(s32) = G_TRUNC %3(s64) + %5:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF + %6:_(s64) = G_ZEXT %1(s32) + %4:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64) + $v0 = COPY %4(<vscale x 2 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv4i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s64) = G_CONSTANT i64 2 + %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv4i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv4i1_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv4i1_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(s64) = COPY $x10 + %0:_(s1) = G_TRUNC %1(s64) + %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s64) + $v0 = COPY %2(<vscale x 4 x s1>) + PseudoRET implicit $v0 +... 
+--- +name: insertelement_nxv8i1_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i1_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 false + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 8 x s1>) + PseudoRET implicit $v0 +... +--- +name: insertelement_nxv8i1_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i1_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>) + ; CHECK-NEXT: PseudoRET implicit $v0 + %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF + %2:_(s1) = G_CONSTANT i1 true + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64) + $v0 = COPY %0(<vscale x 8 x s1>) + PseudoRET implicit $v0 +... 
+---
+name: insertelement_nxv8i1_2 # Inserts a value from $x10 at a variable index derived from $x11 into an undef <vscale x 8 x s1>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv8i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s1
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32) ; index: low 32 bits of $x11 zero-extended; expected output uses a G_AND with 4294967295 instead
+    %4:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_0 # Inserts constant i1 false at index 15 into an undef <vscale x 16 x s1>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false ; element to insert; expected output widens it to an s64 constant 0 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 15 ; insertion index (non-zero in this case)
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_1 # Inserts constant i1 true at index 0 into an undef <vscale x 16 x s1>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true ; element to insert; expected output widens it to an s64 constant 1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_2 # Inserts a value from $x10 at a variable index derived from $x11 into an undef <vscale x 16 x s1>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s1
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32) ; index: low 32 bits of $x11 zero-extended; expected output uses a G_AND with 4294967295 instead
+    %4:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_3 # Inserts a value from $x10 at index 0 into the live-in <vscale x 4 x s1> vector in $v0.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v0, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_3
+    ; CHECK: liveins: $v0, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %0:_(<vscale x 4 x s1>) = COPY $v0 ; destination vector comes from a live-in register rather than G_IMPLICIT_DEF
+    %2:_(s64) = COPY $x10
+    %1:_(s1) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s1
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s64)
+    $v0 = COPY %3(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i8_0 # Inserts constant i8 0 at index 0 into an undef <vscale x 1 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_1 # Inserts constant i8 -1 at index 0 into an undef <vscale x 1 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 1 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s8
+    %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_0 # Inserts constant i8 0 at index 0 into an undef <vscale x 2 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_1 # Inserts constant i8 -1 at index 0 into an undef <vscale x 2 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 2 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s8
+    %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_0 # Inserts constant i8 0 at index 0 into an undef <vscale x 4 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_1 # Inserts constant i8 -1 at index 0 into an undef <vscale x 4 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 4 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s8
+    %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_0 # Inserts constant i8 0 at index 0 into an undef <vscale x 8 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_1 # Inserts constant i8 -1 at index 0 into an undef <vscale x 8 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 8 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s8
+    %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv16i8_0 # Inserts constant i8 0 at index 0 into an undef <vscale x 16 x s8>; result is returned in $v8m2.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_1 # Inserts constant i8 -1 at index 0 into an undef <vscale x 16 x s8>; result is returned in $v8m2.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_2 # Inserts a value from $x10 at a variable s64 index taken directly from $x11 into an undef <vscale x 16 x s8>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i8_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %2:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s8
+    %1:_(s64) = COPY $x11 ; index is already s64, so no extension or masking appears in the expected output
+    %4:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %4, %0(s8), %1(s64)
+    $v8m2 = COPY %3(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i8_3 # Inserts a value from $x10 at index 0 into the live-in <vscale x 4 x s8> vector in $v8.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_3
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s8>) = COPY $v8 ; destination vector comes from a live-in register rather than G_IMPLICIT_DEF
+    %2:_(s64) = COPY $x10
+    %1:_(s8) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s8
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s64)
+    $v8 = COPY %3(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_0 # Inserts constant i16 0 at index 0 into an undef <vscale x 1 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_1 # Inserts constant i16 -1 at index 0 into an undef <vscale x 1 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 1 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s16
+    %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_0 # Inserts constant i16 0 at index 1 into an undef <vscale x 2 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 1 ; insertion index (non-zero in this case)
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_1 # Inserts constant i16 -1 at index 0 into an undef <vscale x 2 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 2 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s16
+    %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_0 # Inserts constant i16 0 at index 0 into an undef <vscale x 4 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_1 # Inserts constant i16 -1 at index 0 into an undef <vscale x 4 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 4 x s16>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s16
+    %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i16_0 # Inserts constant i16 0 at index 0 into an undef <vscale x 8 x s16>; result is returned in $v8m2.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_1 # Inserts constant i16 -1 at index 0 into an undef <vscale x 8 x s16>; result is returned in $v8m2.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 8 x s16>; result is returned in $v8m2.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s16
+    %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8m2 = COPY %2(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i16_0 # Inserts constant i16 0 at index 0 into an undef <vscale x 16 x s16>; result is returned in $v8m4.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_1 # Inserts constant i16 -1 at index 0 into an undef <vscale x 16 x s16>; result is returned in $v8m4.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1 ; element to insert; expected output widens it to an s64 constant -1 plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_2 # Inserts a value taken from $x10 at index 0 into an undef <vscale x 16 x s16>; result is returned in $v8m4.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64) ; element to insert: $x10 narrowed to s16
+    %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8m4 = COPY %2(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i16 # Inserts a value from $x10 at index 0 into the live-in <vscale x 4 x s16> vector in $v8. NOTE(review): sibling tests of this shape carry a _3 suffix; confirm the bare name is intended.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s16>) = COPY $v8 ; destination vector comes from a live-in register rather than G_IMPLICIT_DEF
+    %2:_(s64) = COPY $x10
+    %1:_(s16) = G_TRUNC %2(s64) ; element to insert: $x10 narrowed to s16
+    %4:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s64)
+    $v8 = COPY %3(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_0 # Inserts constant i32 0 at index 0 into an undef <vscale x 1 x s32>.
+legalized: false
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0 ; element to insert; expected output widens it to an s64 constant plus G_TRUNC
+    %3:_(s64) = G_CONSTANT i64 0 ; insertion index
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+--- +name: insertelement_nxv1i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8 = COPY %0(<vscale x 1 x s32>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv1i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv1i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(s64) = COPY $x10 + %0:_(s32) = G_TRUNC %1(s64) + %3:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64) + $v8 = COPY %2(<vscale x 1 x s32>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv2i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8 = COPY %0(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv2i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8 = COPY %0(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv2i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv2i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(s64) = COPY $x10 + %0:_(s32) = G_TRUNC %1(s64) + %3:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64) + $v8 = COPY %2(<vscale x 2 x s32>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv4i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m2 = COPY %0(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv4i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m2 = COPY %0(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv4i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv4i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(s64) = COPY $x10 + %0:_(s32) = G_TRUNC %1(s64) + %3:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64) + $v8m2 = COPY %2(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv8i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m4 = COPY %0(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv8i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m4 = COPY %0(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv8i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv8i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(s64) = COPY $x10 + %0:_(s32) = G_TRUNC %1(s64) + %3:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64) + $v8m4 = COPY %2(<vscale x 8 x s32>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv16i32_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv16i32_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m8 = COPY %0(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv16i32_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv16i32_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64) + $v8m8 = COPY %0(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... +--- +name: insertelement_nxv16i32_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv16i32_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(s64) = COPY $x10 + %0:_(s32) = G_TRUNC %1(s64) + %3:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF + %4:_(s64) = G_CONSTANT i64 0 + %2:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64) + $v8m8 = COPY %2(<vscale x 16 x s32>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv4i32 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $v8m2 + + ; CHECK-LABEL: name: insertelement_nxv4i32 + ; CHECK: liveins: $x10, $v8m2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s32), [[C]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %0:_(<vscale x 4 x s32>) = COPY $v8m2 + %2:_(s64) = COPY $x10 + %1:_(s32) = G_TRUNC %2(s64) + %4:_(s64) = G_CONSTANT i64 0 + %3:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %4(s64) + $v8m2 = COPY %3(<vscale x 4 x s32>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv1i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64) + $v8 = COPY %0(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv1i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv1i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64) + $v8 = COPY %0(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... +--- +name: insertelement_nxv1i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv1i64_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64) + ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_(s64) = COPY $x10 + %2:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF + %3:_(s64) = G_CONSTANT i64 0 + %1:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64) + $v8 = COPY %1(<vscale x 1 x s64>) + PseudoRET implicit $v8 +... 
+--- +name: insertelement_nxv2i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64) + $v8m2 = COPY %0(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv2i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv2i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64) + $v8m2 = COPY %0(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... 
+--- +name: insertelement_nxv2i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv2i64_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m2 + %0:_(s64) = COPY $x10 + %2:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF + %3:_(s64) = G_CONSTANT i64 0 + %1:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64) + $v8m2 = COPY %1(<vscale x 2 x s64>) + PseudoRET implicit $v8m2 +... +--- +name: insertelement_nxv4i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64) + $v8m4 = COPY %0(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv4i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv4i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64) + $v8m4 = COPY %0(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... +--- +name: insertelement_nxv4i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv4i64_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %0:_(s64) = COPY $x10 + %2:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF + %3:_(s64) = G_CONSTANT i64 0 + %1:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64) + $v8m4 = COPY %1(<vscale x 4 x s64>) + PseudoRET implicit $v8m4 +... 
+--- +name: insertelement_nxv8i64_0 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i64_0 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64) + $v8m8 = COPY %0(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... +--- +name: insertelement_nxv8i64_1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insertelement_nxv8i64_1 + ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %2:_(s64) = G_CONSTANT i64 -1 + %3:_(s64) = G_CONSTANT i64 0 + %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64) + $v8m8 = COPY %0(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... 
+--- +name: insertelement_nxv8i64_2 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; CHECK-LABEL: name: insertelement_nxv8i64_2 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64) + ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>) + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %0:_(s64) = COPY $x10 + %2:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF + %3:_(s64) = G_CONSTANT i64 0 + %1:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64) + $v8m8 = COPY %1(<vscale x 8 x s64>) + PseudoRET implicit $v8m8 +... diff --git a/llvm/test/CodeGen/RISCV/branch-rel.mir b/llvm/test/CodeGen/RISCV/branch-rel.mir new file mode 100644 index 0000000..1ed5f57 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/branch-rel.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=riscv64 -run-pass=branch-relaxation -o - -verify-machineinstrs | FileCheck %s + +--- | + define void @foo() { + ret void + } +... 
+--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PseudoBR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: INLINEASM &".space 4096", 1 /* sideeffect attdialect */ + ; CHECK-NEXT: BGE $x1, $x0, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET + bb.0: + liveins: $x1 + BNE $x1, $x0, %bb.3 + PseudoBR %bb.3 + bb.1: + liveins: $x1 + INLINEASM &".space 4096", 1 + BGE $x1, $x0, %bb.3 + bb.3: + PseudoRET +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll index 9937627..d7b00f6 100644 --- a/llvm/test/CodeGen/RISCV/idiv_large.ll +++ b/llvm/test/CodeGen/RISCV/idiv_large.ll @@ -1,16 +1,2315 @@ -; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=riscv32 < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 < %s | FileCheck %s --check-prefix=RV64 + +define i64 @udiv_i64(i64 %x, i64 %y) nounwind { +; RV32-LABEL: udiv_i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: call __udivdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i64: +; RV64: # %bb.0: +; RV64-NEXT: tail __udivdi3 + %res = udiv i64 %x, %y + ret i64 %res +} + +define i65 @udiv_i65(i65 %x, i65 %y) nounwind { +; RV32-LABEL: udiv_i65: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: lw a3, 0(a2) +; RV32-NEXT: lw a4, 4(a2) +; RV32-NEXT: lw t1, 8(a2) +; 
RV32-NEXT: lui a2, 349525 +; RV32-NEXT: lui a5, 209715 +; RV32-NEXT: lui a6, 61681 +; RV32-NEXT: addi t0, a2, 1365 +; RV32-NEXT: addi a7, a5, 819 +; RV32-NEXT: addi a6, a6, -241 +; RV32-NEXT: srli a2, a4, 1 +; RV32-NEXT: slli a5, t1, 31 +; RV32-NEXT: slli t3, a4, 31 +; RV32-NEXT: or t2, a5, a2 +; RV32-NEXT: srli a2, a3, 1 +; RV32-NEXT: or t4, a2, t3 +; RV32-NEXT: bnez t2, .LBB1_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a2, t4, 1 +; RV32-NEXT: or a2, t4, a2 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli a2, a2, 24 +; RV32-NEXT: addi t3, a2, 32 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: srli a2, t2, 1 +; RV32-NEXT: or a2, t2, a2 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli t3, a2, 
24 +; RV32-NEXT: .LBB1_3: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: sw s0, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 88(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 84(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a2, a3, 31 +; RV32-NEXT: li t5, 64 +; RV32-NEXT: bnez a2, .LBB1_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: li s0, 64 +; RV32-NEXT: j .LBB1_6 +; RV32-NEXT: .LBB1_5: +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 8 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: srli a5, a2, 1 +; RV32-NEXT: and a5, a5, t0 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: and a5, a2, a7 +; RV32-NEXT: srli a2, a2, 2 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: srli a5, a2, 4 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: and a2, a2, a6 +; RV32-NEXT: slli a5, a2, 8 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a2, 16 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: srli s0, a2, 24 +; RV32-NEXT: .LBB1_6: # %_udiv-special-cases +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw s2, 8(a1) +; RV32-NEXT: or a1, t4, t2 +; RV32-NEXT: addi s1, s0, 64 +; RV32-NEXT: bnez a1, .LBB1_8 +; RV32-NEXT: # %bb.7: # %_udiv-special-cases +; RV32-NEXT: mv t3, s1 +; RV32-NEXT: .LBB1_8: # %_udiv-special-cases +; RV32-NEXT: snez s4, a1 +; RV32-NEXT: srli a1, a2, 1 +; RV32-NEXT: slli t2, s2, 31 +; RV32-NEXT: slli t4, a2, 31 +; RV32-NEXT: or a1, t2, a1 +; RV32-NEXT: srli t2, a5, 1 +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: bnez a1, .LBB1_10 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; 
RV32-NEXT: srli t2, t6, 1 +; RV32-NEXT: or t2, t6, t2 +; RV32-NEXT: srli t4, t2, 2 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 8 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 16 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: not t2, t2 +; RV32-NEXT: srli t4, t2, 1 +; RV32-NEXT: and t4, t4, t0 +; RV32-NEXT: sub t2, t2, t4 +; RV32-NEXT: and t4, t2, a7 +; RV32-NEXT: srli t2, t2, 2 +; RV32-NEXT: and t2, t2, a7 +; RV32-NEXT: add t2, t4, t2 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: and t2, t2, a6 +; RV32-NEXT: slli t4, t2, 8 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: slli t4, t2, 16 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: srli t2, t2, 24 +; RV32-NEXT: addi s3, t2, 32 +; RV32-NEXT: j .LBB1_11 +; RV32-NEXT: .LBB1_10: +; RV32-NEXT: srli t2, a1, 1 +; RV32-NEXT: or t2, a1, t2 +; RV32-NEXT: srli t4, t2, 2 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 8 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: srli t4, t2, 16 +; RV32-NEXT: or t2, t2, t4 +; RV32-NEXT: not t2, t2 +; RV32-NEXT: srli t4, t2, 1 +; RV32-NEXT: and t4, t4, t0 +; RV32-NEXT: sub t2, t2, t4 +; RV32-NEXT: and t4, t2, a7 +; RV32-NEXT: srli t2, t2, 2 +; RV32-NEXT: and t2, t2, a7 +; RV32-NEXT: add t2, t4, t2 +; RV32-NEXT: srli t4, t2, 4 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: and t2, t2, a6 +; RV32-NEXT: slli t4, t2, 8 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: slli t4, t2, 16 +; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: srli s3, t2, 24 +; RV32-NEXT: .LBB1_11: # %_udiv-special-cases +; RV32-NEXT: andi t4, s2, 1 +; RV32-NEXT: andi t1, t1, 1 +; RV32-NEXT: or t2, a3, a4 +; RV32-NEXT: or s2, a5, a2 +; RV32-NEXT: sltu s0, s1, s0 +; RV32-NEXT: slli s1, a5, 31 +; RV32-NEXT: addi s4, s4, -1 +; RV32-NEXT: beqz s1, .LBB1_13 +; RV32-NEXT: # %bb.12: +; RV32-NEXT: srli t5, s1, 1 +; RV32-NEXT: or t5, s1, t5 +; RV32-NEXT: srli s1, t5, 2 +; RV32-NEXT: or t5, t5, s1 +; 
RV32-NEXT: srli s1, t5, 4 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: srli s1, t5, 8 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: srli s1, t5, 16 +; RV32-NEXT: or t5, t5, s1 +; RV32-NEXT: not t5, t5 +; RV32-NEXT: srli s1, t5, 1 +; RV32-NEXT: and t0, s1, t0 +; RV32-NEXT: sub t0, t5, t0 +; RV32-NEXT: and t5, t0, a7 +; RV32-NEXT: srli t0, t0, 2 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: add a7, t5, a7 +; RV32-NEXT: srli t0, a7, 4 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: and a6, a7, a6 +; RV32-NEXT: slli a7, a6, 8 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: slli a7, a6, 16 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: srli t5, a6, 24 +; RV32-NEXT: .LBB1_13: # %_udiv-special-cases +; RV32-NEXT: or t0, t2, t1 +; RV32-NEXT: or a6, s2, t4 +; RV32-NEXT: and a7, s4, s0 +; RV32-NEXT: or t6, t6, a1 +; RV32-NEXT: addi s0, t5, 64 +; RV32-NEXT: bnez t6, .LBB1_15 +; RV32-NEXT: # %bb.14: # %_udiv-special-cases +; RV32-NEXT: mv s3, s0 +; RV32-NEXT: .LBB1_15: # %_udiv-special-cases +; RV32-NEXT: seqz a1, t0 +; RV32-NEXT: sltu t0, s0, t5 +; RV32-NEXT: snez t5, t6 +; RV32-NEXT: addi t5, t5, -1 +; RV32-NEXT: and t0, t5, t0 +; RV32-NEXT: sltu t5, t3, s3 +; RV32-NEXT: seqz a6, a6 +; RV32-NEXT: mv t6, t5 +; RV32-NEXT: beq a7, t0, .LBB1_17 +; RV32-NEXT: # %bb.16: # %_udiv-special-cases +; RV32-NEXT: sltu t6, a7, t0 +; RV32-NEXT: .LBB1_17: # %_udiv-special-cases +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: andi a6, t6, 1 +; RV32-NEXT: sub a7, a7, t0 +; RV32-NEXT: sub t5, a7, t5 +; RV32-NEXT: sub a7, t3, s3 +; RV32-NEXT: beqz a6, .LBB1_19 +; RV32-NEXT: # %bb.18: # %_udiv-special-cases +; RV32-NEXT: mv t0, a6 +; RV32-NEXT: j .LBB1_20 +; RV32-NEXT: .LBB1_19: +; RV32-NEXT: sltiu t0, a7, 65 +; RV32-NEXT: xori t0, t0, 1 +; RV32-NEXT: snez t3, t5 +; RV32-NEXT: or t0, t0, t3 +; RV32-NEXT: .LBB1_20: # %_udiv-special-cases +; RV32-NEXT: or t6, a1, t0 +; RV32-NEXT: addi a1, t6, -1 +; RV32-NEXT: and t3, t4, a1 +; RV32-NEXT: and t0, a1, a2 +; RV32-NEXT: and a1, a1, a5 +; RV32-NEXT: bnez t6, .LBB1_30 +; RV32-NEXT: 
# %bb.21: # %_udiv-special-cases +; RV32-NEXT: xori t6, a7, 64 +; RV32-NEXT: or t6, t6, a6 +; RV32-NEXT: or t6, t6, t5 +; RV32-NEXT: beqz t6, .LBB1_30 +; RV32-NEXT: # %bb.22: # %udiv-bb1 +; RV32-NEXT: addi a1, a7, 1 +; RV32-NEXT: sw zero, 32(sp) +; RV32-NEXT: sw zero, 36(sp) +; RV32-NEXT: sw zero, 40(sp) +; RV32-NEXT: sw zero, 44(sp) +; RV32-NEXT: sw a5, 48(sp) +; RV32-NEXT: sw a2, 52(sp) +; RV32-NEXT: sw t4, 56(sp) +; RV32-NEXT: li t0, 64 +; RV32-NEXT: addi t3, sp, 48 +; RV32-NEXT: neg s1, a7 +; RV32-NEXT: seqz t6, a1 +; RV32-NEXT: sub a7, t0, a7 +; RV32-NEXT: add t5, t5, t6 +; RV32-NEXT: andi t0, a7, 31 +; RV32-NEXT: srli a7, a7, 3 +; RV32-NEXT: or t6, a1, t5 +; RV32-NEXT: xori s2, t0, 31 +; RV32-NEXT: andi a7, a7, 12 +; RV32-NEXT: seqz t0, t6 +; RV32-NEXT: sub s3, t3, a7 +; RV32-NEXT: add a6, a6, t0 +; RV32-NEXT: lw t3, 0(s3) +; RV32-NEXT: lw s4, 4(s3) +; RV32-NEXT: andi a7, a6, 1 +; RV32-NEXT: or t6, t6, a7 +; RV32-NEXT: srli a6, t3, 1 +; RV32-NEXT: sll t0, s4, s1 +; RV32-NEXT: srl a6, a6, s2 +; RV32-NEXT: or t0, t0, a6 +; RV32-NEXT: sll a6, t3, s1 +; RV32-NEXT: li t3, 0 +; RV32-NEXT: beqz t6, .LBB1_28 +; RV32-NEXT: # %bb.23: # %udiv-preheader +; RV32-NEXT: li t6, 0 +; RV32-NEXT: li s0, 0 +; RV32-NEXT: srli s4, s4, 1 +; RV32-NEXT: lw s3, 8(s3) +; RV32-NEXT: sw zero, 16(sp) +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: sw zero, 24(sp) +; RV32-NEXT: sw zero, 28(sp) +; RV32-NEXT: sw a5, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: sw t4, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: srli a2, a1, 3 +; RV32-NEXT: srl a5, s4, s2 +; RV32-NEXT: mv t4, sp +; RV32-NEXT: snez t2, t2 +; RV32-NEXT: andi a2, a2, 12 +; RV32-NEXT: add t1, t1, t2 +; RV32-NEXT: add a2, t4, a2 +; RV32-NEXT: lw t2, 0(a2) +; RV32-NEXT: lw t4, 4(a2) +; RV32-NEXT: lw a2, 8(a2) +; RV32-NEXT: sll s1, s3, s1 +; RV32-NEXT: andi s2, a1, 31 +; RV32-NEXT: xori s2, s2, 31 +; RV32-NEXT: or s3, s1, a5 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: slli a5, t4, 1 +; RV32-NEXT: sll a2, a2, s2 +; RV32-NEXT: sll 
s2, a5, s2 +; RV32-NEXT: srl s1, t4, a1 +; RV32-NEXT: or s1, s1, a2 +; RV32-NEXT: seqz a2, a3 +; RV32-NEXT: sub a2, a4, a2 +; RV32-NEXT: addi a5, t1, 1 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: andi s3, s3, 1 +; RV32-NEXT: srl t1, t2, a1 +; RV32-NEXT: or s2, t1, s2 +; RV32-NEXT: addi t1, a3, -1 +; RV32-NEXT: j .LBB1_26 +; RV32-NEXT: .LBB1_24: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: sltu t2, a2, s4 +; RV32-NEXT: .LBB1_25: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: srli s1, s1, 31 +; RV32-NEXT: sub t4, a5, s1 +; RV32-NEXT: sub t2, t4, t2 +; RV32-NEXT: slli t2, t2, 31 +; RV32-NEXT: srai s1, t2, 31 +; RV32-NEXT: and s3, s1, a4 +; RV32-NEXT: li t2, 0 +; RV32-NEXT: li t4, 0 +; RV32-NEXT: srli s5, a6, 31 +; RV32-NEXT: sub s4, s4, s3 +; RV32-NEXT: slli s3, t0, 1 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli t0, t0, 31 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: or a6, t3, a6 +; RV32-NEXT: seqz t3, a1 +; RV32-NEXT: or s0, s0, t0 +; RV32-NEXT: or s5, a1, t5 +; RV32-NEXT: sub t5, t5, t3 +; RV32-NEXT: and s6, s1, a3 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi t3, s1, 1 +; RV32-NEXT: or t0, t6, s3 +; RV32-NEXT: sltu t6, s2, s6 +; RV32-NEXT: snez s5, s5 +; RV32-NEXT: andi s3, s0, 1 +; RV32-NEXT: sub s1, s4, t6 +; RV32-NEXT: add a7, a7, s5 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: andi a7, a7, 1 +; RV32-NEXT: or t6, a1, t5 +; RV32-NEXT: or s4, t6, a7 +; RV32-NEXT: sub s2, s2, s6 +; RV32-NEXT: li t6, 0 +; RV32-NEXT: li s0, 0 +; RV32-NEXT: beqz s4, .LBB1_29 +; RV32-NEXT: .LBB1_26: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli t2, s2, 31 +; RV32-NEXT: slli t4, s1, 1 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or s4, t4, t2 +; RV32-NEXT: andi t2, s3, 1 +; RV32-NEXT: or s2, s2, t2 +; RV32-NEXT: bne a2, s4, .LBB1_24 +; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1 +; RV32-NEXT: sltu t2, t1, s2 +; RV32-NEXT: j .LBB1_25 +; RV32-NEXT: .LBB1_28: +; RV32-NEXT: li 
t2, 0 +; RV32-NEXT: li t4, 0 +; RV32-NEXT: .LBB1_29: # %udiv-loop-exit +; RV32-NEXT: srli a2, a6, 31 +; RV32-NEXT: slli a3, t0, 1 +; RV32-NEXT: srli a4, t0, 31 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: or a1, t3, a6 +; RV32-NEXT: or a2, t2, a2 +; RV32-NEXT: or a4, t4, a4 +; RV32-NEXT: or t0, a2, a3 +; RV32-NEXT: andi t3, a4, 1 +; RV32-NEXT: .LBB1_30: # %udiv-end +; RV32-NEXT: andi a2, t3, 1 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw t0, 4(a0) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i65: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a3, a3, 1 +; RV64-NEXT: call __udivti3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %res = udiv i65 %x, %y + ret i65 %res +} define i128 @udiv_i128(i128 %x, i128 %y) nounwind { -; CHECK-LABEL: udiv_i128: -; CHECK: call __udivti3 +; RV32-LABEL: udiv_i128: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -160 +; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill 
+; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s7, a0 +; RV32-NEXT: lw s8, 0(a2) +; RV32-NEXT: lw s9, 4(a2) +; RV32-NEXT: lw s11, 8(a2) +; RV32-NEXT: lw ra, 12(a2) +; RV32-NEXT: lui t4, 349525 +; RV32-NEXT: addi t4, t4, 1365 +; RV32-NEXT: lui t3, 209715 +; RV32-NEXT: addi t3, t3, 819 +; RV32-NEXT: lui t2, 61681 +; RV32-NEXT: addi t2, t2, -241 +; RV32-NEXT: bnez s9, .LBB2_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a0, s8, 1 +; RV32-NEXT: or a0, s8, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi t6, a0, 32 +; RV32-NEXT: j .LBB2_3 +; RV32-NEXT: .LBB2_2: +; RV32-NEXT: srli a0, s9, 1 +; RV32-NEXT: or a0, s9, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli 
a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli t6, a0, 24 +; RV32-NEXT: .LBB2_3: # %_udiv-special-cases +; RV32-NEXT: lw a6, 4(a1) +; RV32-NEXT: or s0, s11, ra +; RV32-NEXT: bnez ra, .LBB2_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: srli a0, s11, 1 +; RV32-NEXT: or a0, s11, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi t5, a0, 32 +; RV32-NEXT: j .LBB2_6 +; RV32-NEXT: .LBB2_5: +; RV32-NEXT: srli a0, ra, 1 +; RV32-NEXT: or a0, ra, a0 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 8 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: srli a3, a0, 16 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: and a3, a0, t3 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: srli a3, a0, 4 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: and a0, a0, t2 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: slli a3, a0, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srli t5, a0, 24 +; RV32-NEXT: .LBB2_6: # %_udiv-special-cases +; RV32-NEXT: lw a7, 12(a1) +; RV32-NEXT: addi a0, t6, 64 +; RV32-NEXT: bnez s0, .LBB2_8 +; RV32-NEXT: # %bb.7: # 
%_udiv-special-cases +; RV32-NEXT: mv t5, a0 +; RV32-NEXT: .LBB2_8: # %_udiv-special-cases +; RV32-NEXT: lw t1, 0(a1) +; RV32-NEXT: lw t0, 8(a1) +; RV32-NEXT: snez s3, s0 +; RV32-NEXT: bnez a6, .LBB2_10 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; RV32-NEXT: srli a1, t1, 1 +; RV32-NEXT: or a1, t1, a1 +; RV32-NEXT: srli a3, a1, 2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 8 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 16 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: srli a3, a1, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: and a3, a1, t3 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: and a1, a1, t3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: and a1, a1, t2 +; RV32-NEXT: slli a3, a1, 8 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: slli a3, a1, 16 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: srli a1, a1, 24 +; RV32-NEXT: addi a3, a1, 32 +; RV32-NEXT: j .LBB2_11 +; RV32-NEXT: .LBB2_10: +; RV32-NEXT: srli a1, a6, 1 +; RV32-NEXT: or a1, a6, a1 +; RV32-NEXT: srli a3, a1, 2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 8 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: srli a3, a1, 16 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: srli a3, a1, 1 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: and a3, a1, t3 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: and a1, a1, t3 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: srli a3, a1, 4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: and a1, a1, t2 +; RV32-NEXT: slli a3, a1, 8 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: slli a3, a1, 16 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: srli a3, a1, 24 +; RV32-NEXT: .LBB2_11: # %_udiv-special-cases +; RV32-NEXT: or a1, s9, ra +; RV32-NEXT: or s0, s8, s11 +; RV32-NEXT: or s1, a6, a7 +; RV32-NEXT: or s2, t1, t0 +; RV32-NEXT: 
sltu t6, a0, t6 +; RV32-NEXT: addi s3, s3, -1 +; RV32-NEXT: addi a0, a3, 64 +; RV32-NEXT: or s4, t0, a7 +; RV32-NEXT: sltu s5, a0, a3 +; RV32-NEXT: snez s6, s4 +; RV32-NEXT: addi s6, s6, -1 +; RV32-NEXT: bnez a7, .LBB2_13 +; RV32-NEXT: # %bb.12: # %_udiv-special-cases +; RV32-NEXT: srli a3, t0, 1 +; RV32-NEXT: or a3, t0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t4 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t3 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t2 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi a3, a3, 32 +; RV32-NEXT: j .LBB2_14 +; RV32-NEXT: .LBB2_13: +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: or a3, a7, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t4 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t3 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t2 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: .LBB2_14: # %_udiv-special-cases +; RV32-NEXT: or s0, s0, a1 +; RV32-NEXT: or a5, s2, s1 +; RV32-NEXT: and a1, s3, t6 +; RV32-NEXT: and a4, 
s6, s5 +; RV32-NEXT: bnez s4, .LBB2_16 +; RV32-NEXT: # %bb.15: # %_udiv-special-cases +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB2_16: # %_udiv-special-cases +; RV32-NEXT: seqz a0, s0 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: sltu t2, t5, a3 +; RV32-NEXT: sub t4, a1, a4 +; RV32-NEXT: mv t3, t2 +; RV32-NEXT: beq a1, a4, .LBB2_18 +; RV32-NEXT: # %bb.17: # %_udiv-special-cases +; RV32-NEXT: sltu t3, a1, a4 +; RV32-NEXT: .LBB2_18: # %_udiv-special-cases +; RV32-NEXT: sub t2, t4, t2 +; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: neg t4, t3 +; RV32-NEXT: seqz t6, t3 +; RV32-NEXT: addi t6, t6, -1 +; RV32-NEXT: or a1, t4, t6 +; RV32-NEXT: sub t3, t5, a3 +; RV32-NEXT: beqz a1, .LBB2_20 +; RV32-NEXT: # %bb.19: # %_udiv-special-cases +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: j .LBB2_21 +; RV32-NEXT: .LBB2_20: +; RV32-NEXT: snez a1, t2 +; RV32-NEXT: sltiu a3, t3, 128 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: .LBB2_21: # %_udiv-special-cases +; RV32-NEXT: or a5, a0, a1 +; RV32-NEXT: addi a3, a5, -1 +; RV32-NEXT: and a0, a3, a7 +; RV32-NEXT: and a1, a3, t0 +; RV32-NEXT: and a4, a3, a6 +; RV32-NEXT: and a3, a3, t1 +; RV32-NEXT: bnez a5, .LBB2_26 +; RV32-NEXT: # %bb.22: # %_udiv-special-cases +; RV32-NEXT: xori a5, t3, 127 +; RV32-NEXT: or a5, a5, t4 +; RV32-NEXT: or t5, t2, t6 +; RV32-NEXT: or a5, a5, t5 +; RV32-NEXT: beqz a5, .LBB2_26 +; RV32-NEXT: # %bb.23: # %udiv-bb1 +; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: addi a1, t3, 1 +; RV32-NEXT: sw zero, 72(sp) +; RV32-NEXT: sw zero, 76(sp) +; RV32-NEXT: sw zero, 80(sp) +; RV32-NEXT: sw zero, 84(sp) +; RV32-NEXT: sw t1, 88(sp) +; RV32-NEXT: sw a6, 92(sp) +; RV32-NEXT: sw t0, 96(sp) +; RV32-NEXT: sw a7, 100(sp) +; RV32-NEXT: li a0, 127 +; RV32-NEXT: addi a2, sp, 88 +; RV32-NEXT: seqz a3, a1 +; RV32-NEXT: sub a0, a0, t3 +; RV32-NEXT: add t2, t2, a3 +; RV32-NEXT: andi a3, a0, 31 +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: or a4, a1, t2 +; RV32-NEXT: xori a3, a3, 31 +; RV32-NEXT: andi a0, a0, 12 +; 
RV32-NEXT: seqz t5, a4 +; RV32-NEXT: sub a2, a2, a0 +; RV32-NEXT: add t5, t4, t5 +; RV32-NEXT: lw a0, 0(a2) +; RV32-NEXT: lw a4, 4(a2) +; RV32-NEXT: lw a5, 8(a2) +; RV32-NEXT: lw a2, 12(a2) +; RV32-NEXT: sltu t4, t5, t4 +; RV32-NEXT: or s0, a1, t5 +; RV32-NEXT: add t4, t6, t4 +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: or s0, s0, t6 +; RV32-NEXT: srli t6, a5, 1 +; RV32-NEXT: srli s1, a4, 1 +; RV32-NEXT: srli s2, a0, 1 +; RV32-NEXT: srl t6, t6, a3 +; RV32-NEXT: srl s1, s1, a3 +; RV32-NEXT: srl a3, s2, a3 +; RV32-NEXT: not t3, t3 +; RV32-NEXT: sll a2, a2, t3 +; RV32-NEXT: or s2, a2, t6 +; RV32-NEXT: sll a2, a5, t3 +; RV32-NEXT: sll a4, a4, t3 +; RV32-NEXT: or s1, a2, s1 +; RV32-NEXT: or t6, a4, a3 +; RV32-NEXT: sll t3, a0, t3 +; RV32-NEXT: bnez s0, .LBB2_27 +; RV32-NEXT: # %bb.24: +; RV32-NEXT: li s6, 0 +; RV32-NEXT: li s7, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: .LBB2_25: # %udiv-loop-exit +; RV32-NEXT: srli a0, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a0, s2, a0 +; RV32-NEXT: srli a1, t6, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or a1, s1, a1 +; RV32-NEXT: srli a2, t3, 31 +; RV32-NEXT: slli t6, t6, 1 +; RV32-NEXT: slli a3, t3, 1 +; RV32-NEXT: or a3, s0, a3 +; RV32-NEXT: or a2, s6, a2 +; RV32-NEXT: or a4, a2, t6 +; RV32-NEXT: or a1, s7, a1 +; RV32-NEXT: or a0, s8, a0 +; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .LBB2_26: # %udiv-end +; RV32-NEXT: sw a3, 0(s7) +; RV32-NEXT: sw a4, 4(s7) +; RV32-NEXT: sw a1, 8(s7) +; RV32-NEXT: sw a0, 12(s7) +; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 120(sp) # 
4-byte Folded Reload +; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 160 +; RV32-NEXT: ret +; RV32-NEXT: .LBB2_27: # %udiv-preheader +; RV32-NEXT: li s0, 0 +; RV32-NEXT: li s5, 0 +; RV32-NEXT: li s3, 0 +; RV32-NEXT: li s4, 0 +; RV32-NEXT: sw zero, 56(sp) +; RV32-NEXT: sw zero, 60(sp) +; RV32-NEXT: sw zero, 64(sp) +; RV32-NEXT: sw zero, 68(sp) +; RV32-NEXT: sw t1, 40(sp) +; RV32-NEXT: sw a6, 44(sp) +; RV32-NEXT: sw t0, 48(sp) +; RV32-NEXT: sw a7, 52(sp) +; RV32-NEXT: srli a0, a1, 3 +; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: andi a0, a0, 12 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: andi a5, a1, 31 +; RV32-NEXT: xori a5, a5, 31 +; RV32-NEXT: slli a6, a4, 1 +; RV32-NEXT: slli a7, a3, 1 +; RV32-NEXT: slli t0, a2, 1 +; RV32-NEXT: sll a6, a6, a5 +; RV32-NEXT: sll a7, a7, a5 +; RV32-NEXT: sll a5, t0, a5 +; RV32-NEXT: seqz t0, s8 +; RV32-NEXT: srl a3, a3, a1 +; RV32-NEXT: or s10, a3, a6 +; RV32-NEXT: or a3, s8, s9 +; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sub a6, s9, t0 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: srl a2, a2, a1 +; RV32-NEXT: or s9, a2, a7 +; RV32-NEXT: sub a7, s11, a3 +; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sltu a2, s11, a3 +; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sub a2, ra, a2 +; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: srl ra, a4, a1 +; RV32-NEXT: or t1, a0, a5 +; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s8, s8, -1 +; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: j .LBB2_29 +; RV32-NEXT: .LBB2_28: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: li s6, 0 +; RV32-NEXT: sub a0, a0, a5 +; 
RV32-NEXT: srli a5, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a5, s2, a5 +; RV32-NEXT: srli s2, t6, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or s1, s1, s2 +; RV32-NEXT: srli s2, t3, 31 +; RV32-NEXT: slli t6, t6, 1 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: or t6, t6, s2 +; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: and s2, s10, a2 +; RV32-NEXT: or t3, s0, t3 +; RV32-NEXT: sub a2, a3, s2 +; RV32-NEXT: sltu a3, a3, s2 +; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: and s0, s10, t0 +; RV32-NEXT: sub t0, s9, s0 +; RV32-NEXT: or s2, a1, t2 +; RV32-NEXT: sub s9, a0, a4 +; RV32-NEXT: seqz a0, a1 +; RV32-NEXT: sub t2, t2, a0 +; RV32-NEXT: or t6, s5, t6 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi s0, s10, 1 +; RV32-NEXT: seqz a0, s2 +; RV32-NEXT: or s1, s3, s1 +; RV32-NEXT: or s2, s4, a5 +; RV32-NEXT: sub s10, a2, ra +; RV32-NEXT: sltu a2, a2, ra +; RV32-NEXT: sub a3, t0, a3 +; RV32-NEXT: sltu a4, t5, a0 +; RV32-NEXT: sub t5, t5, a0 +; RV32-NEXT: sub ra, a3, a2 +; RV32-NEXT: sub t4, t4, a4 +; RV32-NEXT: or a0, t2, t4 +; RV32-NEXT: or a2, a1, t5 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sub t1, s11, t1 +; RV32-NEXT: li s5, 0 +; RV32-NEXT: li s3, 0 +; RV32-NEXT: li s4, 0 +; RV32-NEXT: beqz a0, .LBB2_25 +; RV32-NEXT: .LBB2_29: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli a0, t1, 31 +; RV32-NEXT: slli a3, s9, 1 +; RV32-NEXT: slli t1, t1, 1 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: srli a3, s2, 31 +; RV32-NEXT: or s11, t1, a3 +; RV32-NEXT: beq a6, a0, .LBB2_31 +; RV32-NEXT: # %bb.30: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: sltu a4, a6, a0 +; RV32-NEXT: j .LBB2_32 +; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: sltu a4, a2, s11 +; RV32-NEXT: .LBB2_32: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload +; 
RV32-NEXT: srli a3, s10, 31 +; RV32-NEXT: slli ra, ra, 1 +; RV32-NEXT: srli a5, s9, 31 +; RV32-NEXT: slli s10, s10, 1 +; RV32-NEXT: or s9, ra, a3 +; RV32-NEXT: or a3, s10, a5 +; RV32-NEXT: sub a5, a7, a3 +; RV32-NEXT: sltu t1, a7, a3 +; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: sub s6, t0, s9 +; RV32-NEXT: sltu a4, a5, a4 +; RV32-NEXT: sub a5, s6, t1 +; RV32-NEXT: sub a5, a5, a4 +; RV32-NEXT: srai s10, a5, 31 +; RV32-NEXT: and t1, s10, a2 +; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: and a5, s10, a2 +; RV32-NEXT: sltu a4, s11, t1 +; RV32-NEXT: mv ra, a4 +; RV32-NEXT: beq a0, a5, .LBB2_28 +; RV32-NEXT: # %bb.33: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1 +; RV32-NEXT: sltu ra, a0, a5 +; RV32-NEXT: j .LBB2_28 +; +; RV64-LABEL: udiv_i128: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: call __udivti3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %res = udiv i128 %x, %y ret i128 %res } define i129 @udiv_i129(i129 %x, i129 %y) nounwind { -; CHECK-LABEL: udiv_i129: -; CHECK-NOT: call{{.*}}div +; RV32-LABEL: udiv_i129: +; RV32: # %bb.0: # %_udiv-special-cases +; RV32-NEXT: addi sp, sp, -240 +; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: mv ra, a0 +; RV32-NEXT: lw t2, 16(a2) +; 
RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a6, 8(a2) +; RV32-NEXT: lw a0, 12(a2) +; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi t5, a0, 1365 +; RV32-NEXT: addi t4, a2, 819 +; RV32-NEXT: addi t3, a3, -241 +; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a0, a6, 31 +; RV32-NEXT: srli a2, a5, 1 +; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a3, a5, 31 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: srli a2, a4, 1 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: bnez a0, .LBB3_2 +; RV32-NEXT: # %bb.1: # %_udiv-special-cases +; RV32-NEXT: srli a3, a2, 1 +; RV32-NEXT: or a3, a2, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi a6, a3, 32 +; RV32-NEXT: j .LBB3_3 +; RV32-NEXT: .LBB3_2: +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; 
RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a6, a3, 24 +; RV32-NEXT: .LBB3_3: # %_udiv-special-cases +; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: slli a5, t2, 31 +; RV32-NEXT: slli a7, a7, 31 +; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: srli t0, a4, 1 +; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: slli a4, a4, 31 +; RV32-NEXT: li s2, 64 +; RV32-NEXT: bnez a4, .LBB3_5 +; RV32-NEXT: # %bb.4: # %_udiv-special-cases +; RV32-NEXT: li t6, 64 +; RV32-NEXT: j .LBB3_6 +; RV32-NEXT: .LBB3_5: +; RV32-NEXT: srli t1, a4, 1 +; RV32-NEXT: or t1, a4, t1 +; RV32-NEXT: srli t6, t1, 2 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 4 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 8 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: srli t6, t1, 16 +; RV32-NEXT: or t1, t1, t6 +; RV32-NEXT: not t1, t1 +; RV32-NEXT: srli t6, t1, 1 +; RV32-NEXT: and t6, t6, t5 +; RV32-NEXT: sub t1, t1, t6 +; RV32-NEXT: and t6, t1, t4 +; RV32-NEXT: srli t1, t1, 2 +; RV32-NEXT: and t1, t1, t4 +; RV32-NEXT: add t1, t6, t1 +; RV32-NEXT: srli t6, t1, 4 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: and t1, t1, t3 +; RV32-NEXT: slli t6, t1, 8 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: slli t6, t1, 16 +; RV32-NEXT: add t1, t1, t6 +; RV32-NEXT: srli t6, t1, 24 +; RV32-NEXT: .LBB3_6: # %_udiv-special-cases +; RV32-NEXT: or t1, a5, a3 +; RV32-NEXT: or a7, t0, a7 +; RV32-NEXT: bnez a4, .LBB3_8 +; RV32-NEXT: # %bb.7: # %_udiv-special-cases +; RV32-NEXT: li t6, 128 +; RV32-NEXT: .LBB3_8: # %_udiv-special-cases +; RV32-NEXT: or a5, a7, t1 +; RV32-NEXT: addi a4, a6, 64 +; RV32-NEXT: addi a3, t6, 128 +; RV32-NEXT: or a0, a0, t1 +; RV32-NEXT: or a2, a2, a7 +; RV32-NEXT: or 
s3, a2, a0 +; RV32-NEXT: sltu s0, a3, t6 +; RV32-NEXT: bnez s3, .LBB3_11 +; RV32-NEXT: # %bb.9: # %_udiv-special-cases +; RV32-NEXT: mv t6, s0 +; RV32-NEXT: beqz t1, .LBB3_12 +; RV32-NEXT: .LBB3_10: +; RV32-NEXT: srli a0, t1, 1 +; RV32-NEXT: or a0, t1, a0 +; RV32-NEXT: srli a2, a0, 2 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 8 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 16 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a2, a0, 1 +; RV32-NEXT: and a2, a2, t5 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: and a2, a0, t4 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: slli a2, a0, 8 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: srli s1, a0, 24 +; RV32-NEXT: beqz a5, .LBB3_13 +; RV32-NEXT: j .LBB3_14 +; RV32-NEXT: .LBB3_11: +; RV32-NEXT: snez a0, a5 +; RV32-NEXT: sltu a2, a4, a6 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and t6, a0, a2 +; RV32-NEXT: bnez t1, .LBB3_10 +; RV32-NEXT: .LBB3_12: # %_udiv-special-cases +; RV32-NEXT: srli a0, a7, 1 +; RV32-NEXT: or a0, a7, a0 +; RV32-NEXT: srli a2, a0, 2 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 8 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a2, a0, 16 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: srli a2, a0, 1 +; RV32-NEXT: and a2, a2, t5 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: and a2, a0, t4 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: and a0, a0, t4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: and a0, a0, t3 +; RV32-NEXT: slli a2, a0, 8 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: slli a2, a0, 16 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: addi s1, a0, 32 
+; RV32-NEXT: bnez a5, .LBB3_14 +; RV32-NEXT: .LBB3_13: # %_udiv-special-cases +; RV32-NEXT: mv s1, a4 +; RV32-NEXT: .LBB3_14: # %_udiv-special-cases +; RV32-NEXT: lw a7, 0(a1) +; RV32-NEXT: lw t0, 4(a1) +; RV32-NEXT: lw a6, 8(a1) +; RV32-NEXT: bnez s3, .LBB3_16 +; RV32-NEXT: # %bb.15: # %_udiv-special-cases +; RV32-NEXT: mv s1, a3 +; RV32-NEXT: .LBB3_16: # %_udiv-special-cases +; RV32-NEXT: lw t1, 12(a1) +; RV32-NEXT: lw a1, 16(a1) +; RV32-NEXT: slli a0, a6, 31 +; RV32-NEXT: srli a2, t0, 1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: slli a2, t0, 31 +; RV32-NEXT: srli a3, a7, 1 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: bnez a0, .LBB3_18 +; RV32-NEXT: # %bb.17: # %_udiv-special-cases +; RV32-NEXT: srli a3, a2, 1 +; RV32-NEXT: or a3, a2, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli a3, a3, 24 +; RV32-NEXT: addi s5, a3, 32 +; RV32-NEXT: j .LBB3_19 +; RV32-NEXT: .LBB3_18: +; RV32-NEXT: srli a3, a0, 1 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: srli a4, a3, 2 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 8 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: srli a4, a3, 16 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: not a3, a3 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: and a4, a4, t5 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: and a4, a3, t4 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a3, a3, 
t4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: srli a4, a3, 4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: and a3, a3, t3 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: srli s5, a3, 24 +; RV32-NEXT: .LBB3_19: # %_udiv-special-cases +; RV32-NEXT: srli a3, t1, 1 +; RV32-NEXT: slli a4, a1, 31 +; RV32-NEXT: slli a5, t1, 31 +; RV32-NEXT: slli s4, a7, 31 +; RV32-NEXT: srli s6, a6, 1 +; RV32-NEXT: beqz s4, .LBB3_21 +; RV32-NEXT: # %bb.20: +; RV32-NEXT: srli s2, s4, 1 +; RV32-NEXT: or s2, s4, s2 +; RV32-NEXT: srli s7, s2, 2 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 4 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 8 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: srli s7, s2, 16 +; RV32-NEXT: or s2, s2, s7 +; RV32-NEXT: not s2, s2 +; RV32-NEXT: srli s7, s2, 1 +; RV32-NEXT: and s7, s7, t5 +; RV32-NEXT: sub s2, s2, s7 +; RV32-NEXT: and s7, s2, t4 +; RV32-NEXT: srli s2, s2, 2 +; RV32-NEXT: and s2, s2, t4 +; RV32-NEXT: add s2, s7, s2 +; RV32-NEXT: srli s7, s2, 4 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: and s2, s2, t3 +; RV32-NEXT: slli s7, s2, 8 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: slli s7, s2, 16 +; RV32-NEXT: add s2, s2, s7 +; RV32-NEXT: srli s2, s2, 24 +; RV32-NEXT: .LBB3_21: # %_udiv-special-cases +; RV32-NEXT: or s7, a4, a3 +; RV32-NEXT: or s6, s6, a5 +; RV32-NEXT: bnez s4, .LBB3_23 +; RV32-NEXT: # %bb.22: # %_udiv-special-cases +; RV32-NEXT: li s2, 128 +; RV32-NEXT: .LBB3_23: # %_udiv-special-cases +; RV32-NEXT: or s4, s6, s7 +; RV32-NEXT: addi a5, s5, 64 +; RV32-NEXT: addi a3, s2, 128 +; RV32-NEXT: or a0, a0, s7 +; RV32-NEXT: or a4, a2, s6 +; RV32-NEXT: or a4, a4, a0 +; RV32-NEXT: sltu a0, a3, s2 +; RV32-NEXT: bnez a4, .LBB3_26 +; RV32-NEXT: # %bb.24: # %_udiv-special-cases +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: snez s2, s3 +; RV32-NEXT: beqz s7, .LBB3_27 +; RV32-NEXT: .LBB3_25: +; RV32-NEXT: srli s3, s7, 1 +; RV32-NEXT: or s3, s7, s3 +; RV32-NEXT: srli s5, s3, 2 +; 
RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 4 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 8 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 16 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: not s3, s3 +; RV32-NEXT: srli s5, s3, 1 +; RV32-NEXT: and t5, s5, t5 +; RV32-NEXT: sub t5, s3, t5 +; RV32-NEXT: and s3, t5, t4 +; RV32-NEXT: srli t5, t5, 2 +; RV32-NEXT: and t4, t5, t4 +; RV32-NEXT: add t4, s3, t4 +; RV32-NEXT: srli t5, t4, 4 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: and t3, t4, t3 +; RV32-NEXT: slli t4, t3, 8 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: slli t4, t3, 16 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: srli t3, t3, 24 +; RV32-NEXT: j .LBB3_28 +; RV32-NEXT: .LBB3_26: +; RV32-NEXT: snez a2, s4 +; RV32-NEXT: sltu s2, a5, s5 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, s2 +; RV32-NEXT: snez s2, s3 +; RV32-NEXT: bnez s7, .LBB3_25 +; RV32-NEXT: .LBB3_27: # %_udiv-special-cases +; RV32-NEXT: srli s3, s6, 1 +; RV32-NEXT: or s3, s6, s3 +; RV32-NEXT: srli s5, s3, 2 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 4 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 8 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: srli s5, s3, 16 +; RV32-NEXT: or s3, s3, s5 +; RV32-NEXT: not s3, s3 +; RV32-NEXT: srli s5, s3, 1 +; RV32-NEXT: and t5, s5, t5 +; RV32-NEXT: sub t5, s3, t5 +; RV32-NEXT: and s3, t5, t4 +; RV32-NEXT: srli t5, t5, 2 +; RV32-NEXT: and t4, t5, t4 +; RV32-NEXT: add t4, s3, t4 +; RV32-NEXT: srli t5, t4, 4 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: and t3, t4, t3 +; RV32-NEXT: slli t4, t3, 8 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: slli t4, t3, 16 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: srli t3, t3, 24 +; RV32-NEXT: addi t3, t3, 32 +; RV32-NEXT: .LBB3_28: # %_udiv-special-cases +; RV32-NEXT: xori t4, s0, 1 +; RV32-NEXT: addi s2, s2, -1 +; RV32-NEXT: bnez s4, .LBB3_30 +; RV32-NEXT: # %bb.29: # %_udiv-special-cases +; RV32-NEXT: mv t3, a5 +; RV32-NEXT: .LBB3_30: # %_udiv-special-cases +; RV32-NEXT: andi s11, a1, 1 +; 
RV32-NEXT: andi s8, t2, 1 +; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: or s9, a1, a5 +; RV32-NEXT: or t2, a7, a6 +; RV32-NEXT: neg a1, t4 +; RV32-NEXT: and s0, s2, s0 +; RV32-NEXT: bnez a4, .LBB3_32 +; RV32-NEXT: # %bb.31: # %_udiv-special-cases +; RV32-NEXT: mv t3, a3 +; RV32-NEXT: .LBB3_32: # %_udiv-special-cases +; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: or s10, a3, a5 +; RV32-NEXT: or a5, s9, s8 +; RV32-NEXT: or t4, t0, t1 +; RV32-NEXT: or t5, t2, s11 +; RV32-NEXT: and a1, s0, a1 +; RV32-NEXT: xori a3, a0, 1 +; RV32-NEXT: snez a4, a4 +; RV32-NEXT: neg a3, a3 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: sltu a4, s1, t3 +; RV32-NEXT: and t2, a0, a3 +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: beq t6, a2, .LBB3_34 +; RV32-NEXT: # %bb.33: # %_udiv-special-cases +; RV32-NEXT: sltu a3, t6, a2 +; RV32-NEXT: .LBB3_34: # %_udiv-special-cases +; RV32-NEXT: or a0, a5, s10 +; RV32-NEXT: or t5, t5, t4 +; RV32-NEXT: sltu t4, a1, t2 +; RV32-NEXT: mv s0, a3 +; RV32-NEXT: beq a1, t2, .LBB3_36 +; RV32-NEXT: # %bb.35: # %_udiv-special-cases +; RV32-NEXT: mv s0, t4 +; RV32-NEXT: .LBB3_36: # %_udiv-special-cases +; RV32-NEXT: seqz a5, a0 +; RV32-NEXT: seqz t5, t5 +; RV32-NEXT: andi a0, s0, 1 +; RV32-NEXT: sub a2, t6, a2 +; RV32-NEXT: sub a1, a1, t2 +; RV32-NEXT: sub t2, a2, a4 +; RV32-NEXT: sltu a2, a1, a3 +; RV32-NEXT: add a2, t4, a2 +; RV32-NEXT: neg t4, a2 +; RV32-NEXT: sub a4, a1, a3 +; RV32-NEXT: or a1, a4, t4 +; RV32-NEXT: sub a3, s1, t3 +; RV32-NEXT: beqz a1, .LBB3_38 +; RV32-NEXT: # %bb.37: # %_udiv-special-cases +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: or a2, a5, t5 +; RV32-NEXT: bnez a0, .LBB3_39 +; RV32-NEXT: j .LBB3_40 +; RV32-NEXT: .LBB3_38: +; RV32-NEXT: snez a1, t2 +; RV32-NEXT: sltiu a2, a3, 129 +; RV32-NEXT: xori a2, a2, 1 +; RV32-NEXT: or a1, a2, a1 +; RV32-NEXT: or a2, a5, t5 +; RV32-NEXT: beqz a0, 
.LBB3_40 +; RV32-NEXT: .LBB3_39: # %_udiv-special-cases +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: .LBB3_40: # %_udiv-special-cases +; RV32-NEXT: or t6, a2, a1 +; RV32-NEXT: addi a1, t6, -1 +; RV32-NEXT: and a2, s11, a1 +; RV32-NEXT: and a5, a1, t1 +; RV32-NEXT: and t3, a1, a6 +; RV32-NEXT: and t5, a1, t0 +; RV32-NEXT: and a1, a1, a7 +; RV32-NEXT: bnez t6, .LBB3_57 +; RV32-NEXT: # %bb.41: # %_udiv-special-cases +; RV32-NEXT: or t6, t2, t4 +; RV32-NEXT: xori s0, a3, 128 +; RV32-NEXT: or s0, s0, a0 +; RV32-NEXT: or s0, s0, a4 +; RV32-NEXT: or t6, s0, t6 +; RV32-NEXT: beqz t6, .LBB3_57 +; RV32-NEXT: # %bb.42: # %udiv-bb1 +; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: addi a1, a3, 1 +; RV32-NEXT: sw zero, 136(sp) +; RV32-NEXT: sw zero, 140(sp) +; RV32-NEXT: sw zero, 144(sp) +; RV32-NEXT: sw zero, 148(sp) +; RV32-NEXT: sw zero, 120(sp) +; RV32-NEXT: sw zero, 124(sp) +; RV32-NEXT: sw zero, 128(sp) +; RV32-NEXT: sw zero, 132(sp) +; RV32-NEXT: sw a7, 152(sp) +; RV32-NEXT: sw t0, 156(sp) +; RV32-NEXT: sw a6, 160(sp) +; RV32-NEXT: sw t1, 164(sp) +; RV32-NEXT: sw s11, 168(sp) +; RV32-NEXT: li a5, 128 +; RV32-NEXT: addi t3, sp, 152 +; RV32-NEXT: neg a2, a3 +; RV32-NEXT: seqz t5, a1 +; RV32-NEXT: sub a5, a5, a3 +; RV32-NEXT: add t2, t2, t5 +; RV32-NEXT: andi a3, a5, 31 +; RV32-NEXT: srli t5, a5, 3 +; RV32-NEXT: or t6, a1, t2 +; RV32-NEXT: xori a5, a3, 31 +; RV32-NEXT: andi a3, t5, 28 +; RV32-NEXT: seqz t6, t6 +; RV32-NEXT: sub ra, t3, a3 +; RV32-NEXT: add t6, a4, t6 +; RV32-NEXT: lw t3, 0(ra) +; RV32-NEXT: lw s0, 4(ra) +; RV32-NEXT: lw s1, 8(ra) +; RV32-NEXT: lw a3, 12(ra) +; RV32-NEXT: sltu a4, t6, a4 +; RV32-NEXT: or t5, a1, t6 +; RV32-NEXT: add t4, t4, a4 +; RV32-NEXT: or a4, t2, t4 +; RV32-NEXT: or a4, t5, a4 +; RV32-NEXT: srli t5, s1, 1 +; RV32-NEXT: seqz s2, a4 +; RV32-NEXT: add a0, a0, s2 +; RV32-NEXT: sll s2, a3, a2 +; RV32-NEXT: srl t5, t5, a5 +; RV32-NEXT: or t5, s2, t5 +; RV32-NEXT: srli s2, s0, 1 +; RV32-NEXT: sll s1, s1, a2 +; RV32-NEXT: srl s2, s2, a5 
+; RV32-NEXT: or s2, s1, s2 +; RV32-NEXT: srli s1, t3, 1 +; RV32-NEXT: sll s0, s0, a2 +; RV32-NEXT: srl s1, s1, a5 +; RV32-NEXT: andi s3, a0, 1 +; RV32-NEXT: or s1, s0, s1 +; RV32-NEXT: or a0, a4, s3 +; RV32-NEXT: sll t3, t3, a2 +; RV32-NEXT: beqz a0, .LBB3_55 +; RV32-NEXT: # %bb.43: # %udiv-preheader +; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: lw a0, 16(ra) +; RV32-NEXT: sw zero, 104(sp) +; RV32-NEXT: sw zero, 108(sp) +; RV32-NEXT: sw zero, 112(sp) +; RV32-NEXT: sw zero, 116(sp) +; RV32-NEXT: sw zero, 88(sp) +; RV32-NEXT: sw zero, 92(sp) +; RV32-NEXT: sw zero, 96(sp) +; RV32-NEXT: sw zero, 100(sp) +; RV32-NEXT: sw s11, 72(sp) +; RV32-NEXT: sw zero, 76(sp) +; RV32-NEXT: sw zero, 80(sp) +; RV32-NEXT: sw zero, 84(sp) +; RV32-NEXT: sw a7, 56(sp) +; RV32-NEXT: sw t0, 60(sp) +; RV32-NEXT: sw a6, 64(sp) +; RV32-NEXT: sw t1, 68(sp) +; RV32-NEXT: srli a4, a1, 3 +; RV32-NEXT: addi a6, sp, 56 +; RV32-NEXT: andi a7, a1, 31 +; RV32-NEXT: or t0, s9, s10 +; RV32-NEXT: srl a3, a3, a5 +; RV32-NEXT: andi a4, a4, 28 +; RV32-NEXT: xori a5, a7, 31 +; RV32-NEXT: snez a7, t0 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: add a7, s8, a7 +; RV32-NEXT: lw a6, 16(a4) +; RV32-NEXT: lw t0, 0(a4) +; RV32-NEXT: lw t1, 4(a4) +; RV32-NEXT: lw s0, 8(a4) +; RV32-NEXT: lw a4, 12(a4) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: or a3, a0, a3 +; RV32-NEXT: slli a6, a6, 1 +; RV32-NEXT: slli a0, a4, 1 +; RV32-NEXT: slli a2, s0, 1 +; RV32-NEXT: slli s4, t1, 1 +; RV32-NEXT: sll a6, a6, a5 +; RV32-NEXT: sll a0, a0, a5 +; RV32-NEXT: sll s8, a2, a5 +; RV32-NEXT: sll s4, s4, a5 +; RV32-NEXT: srl a2, a4, a1 +; RV32-NEXT: or ra, a2, a6 +; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: seqz a4, a6 +; RV32-NEXT: srl a2, s0, a1 +; RV32-NEXT: or a2, a2, a0 +; RV32-NEXT: lw a5, 
20(sp) # 4-byte Folded Reload +; RV32-NEXT: or a0, a6, a5 +; RV32-NEXT: sub s5, a5, a4 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: srl a0, t1, a1 +; RV32-NEXT: or a0, a0, s8 +; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: sub t1, a5, a4 +; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sltu a4, a5, a4 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: sub s6, a5, a4 +; RV32-NEXT: andi a4, a7, 1 +; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: andi a5, a3, 1 +; RV32-NEXT: srl a3, t0, a1 +; RV32-NEXT: or a4, a3, s4 +; RV32-NEXT: addi a6, a6, -1 +; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li s11, 0 +; RV32-NEXT: li s10, 0 +; RV32-NEXT: j .LBB3_45 +; RV32-NEXT: .LBB3_44: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: and s0, a5, s0 +; RV32-NEXT: xor s8, t1, a7 +; RV32-NEXT: xor s9, a2, s0 +; RV32-NEXT: or s8, s9, s8 +; RV32-NEXT: li s9, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: sltu s4, a2, s0 +; RV32-NEXT: sub s0, a2, s0 +; RV32-NEXT: sub a7, t1, a7 +; RV32-NEXT: srli a2, s2, 31 +; RV32-NEXT: sub a0, a0, t0 +; RV32-NEXT: slli t0, t5, 1 +; RV32-NEXT: or t0, t0, a2 +; RV32-NEXT: srli a2, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or t1, s2, a2 +; RV32-NEXT: srli a2, t3, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: or s1, s1, a2 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: or t3, a2, t3 +; RV32-NEXT: srli a2, t5, 31 +; RV32-NEXT: or s7, s7, a2 +; RV32-NEXT: sub a2, s0, ra +; RV32-NEXT: sltu s0, s0, ra +; RV32-NEXT: or t5, a1, t6 +; RV32-NEXT: sub a7, a7, s4 +; RV32-NEXT: or s2, t2, t4 +; RV32-NEXT: sub a0, a0, a6 +; RV32-NEXT: or a6, a1, t2 +; RV32-NEXT: or s4, t5, s2 +; RV32-NEXT: seqz t5, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: seqz a6, a6 +; 
RV32-NEXT: sub t2, t2, t5 +; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: or s1, a5, s1 +; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: or s2, a5, t1 +; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: or t5, a5, t0 +; RV32-NEXT: andi a5, s7, 1 +; RV32-NEXT: sub ra, a7, s0 +; RV32-NEXT: snez a7, s4 +; RV32-NEXT: sltu t0, t6, a6 +; RV32-NEXT: sub t6, t6, a6 +; RV32-NEXT: add a7, s3, a7 +; RV32-NEXT: sub t4, t4, t0 +; RV32-NEXT: or a6, a1, t6 +; RV32-NEXT: addi a7, a7, 1 +; RV32-NEXT: or t0, t2, t4 +; RV32-NEXT: andi s3, a7, 1 +; RV32-NEXT: or a6, a6, t0 +; RV32-NEXT: or a6, a6, s3 +; RV32-NEXT: sub a4, a4, a3 +; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: li s7, 0 +; RV32-NEXT: beqz a6, .LBB3_56 +; RV32-NEXT: .LBB3_45: # %udiv-do-while +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: srli a3, a2, 31 +; RV32-NEXT: slli a6, ra, 1 +; RV32-NEXT: or t1, a6, a3 +; RV32-NEXT: srli a3, a0, 31 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: beq s6, t1, .LBB3_47 +; RV32-NEXT: # %bb.46: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu a3, s6, t1 +; RV32-NEXT: j .LBB3_48 +; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: sltu a3, a3, a2 +; RV32-NEXT: .LBB3_48: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: srli a6, a4, 31 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: slli a4, a4, 1 +; RV32-NEXT: or a0, a0, a6 +; RV32-NEXT: andi a5, a5, 1 +; RV32-NEXT: or a4, a4, a5 +; RV32-NEXT: beq s5, a0, .LBB3_50 +; RV32-NEXT: # %bb.49: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu a5, s5, a0 +; RV32-NEXT: j .LBB3_51 +; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded 
Reload +; RV32-NEXT: sltu a5, a5, a4 +; RV32-NEXT: .LBB3_51: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: xor a6, a6, a2 +; RV32-NEXT: xor a7, s6, t1 +; RV32-NEXT: or a6, a6, a7 +; RV32-NEXT: beqz a6, .LBB3_53 +; RV32-NEXT: # %bb.52: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: .LBB3_53: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: srli a3, ra, 31 +; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: sub a3, a6, a3 +; RV32-NEXT: sub a3, a3, a5 +; RV32-NEXT: slli a3, a3, 31 +; RV32-NEXT: srai a5, a3, 31 +; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: and a7, a5, a3 +; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: and t0, a5, a6 +; RV32-NEXT: sltu a6, a4, a3 +; RV32-NEXT: mv ra, a6 +; RV32-NEXT: beq a0, t0, .LBB3_44 +; RV32-NEXT: # %bb.54: # %udiv-do-while +; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1 +; RV32-NEXT: sltu ra, a0, t0 +; RV32-NEXT: j .LBB3_44 +; RV32-NEXT: .LBB3_55: +; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: li s11, 0 +; RV32-NEXT: li s9, 0 +; RV32-NEXT: li s10, 0 +; RV32-NEXT: li s8, 0 +; RV32-NEXT: .LBB3_56: # %udiv-loop-exit +; RV32-NEXT: srli a0, s2, 31 +; RV32-NEXT: slli a1, t5, 1 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: srli a1, s1, 31 +; RV32-NEXT: slli s2, s2, 1 +; RV32-NEXT: or a2, s2, a1 +; RV32-NEXT: srli a3, t3, 31 +; RV32-NEXT: slli s1, s1, 1 +; RV32-NEXT: srli a4, t5, 31 +; RV32-NEXT: slli t3, t3, 1 +; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: or a1, a1, t3 +; RV32-NEXT: or a3, s11, a3 +; RV32-NEXT: or a4, s8, a4 +; RV32-NEXT: or t5, a3, s1 +; RV32-NEXT: or t3, s9, a2 +; RV32-NEXT: or a5, s10, a0 +; RV32-NEXT: andi a2, a4, 1 +; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: .LBB3_57: # 
%udiv-end +; RV32-NEXT: sw a1, 0(ra) +; RV32-NEXT: sw t5, 4(ra) +; RV32-NEXT: sw t3, 8(ra) +; RV32-NEXT: sw a5, 12(ra) +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: sb a2, 16(ra) +; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 240 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv_i129: +; RV64: # %bb.0: # %_udiv-special-cases +; RV64-NEXT: ld a3, 0(a2) +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld t1, 16(a2) +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a5, 209715 +; RV64-NEXT: lui a6, 61681 +; RV64-NEXT: addi t0, a2, 1365 +; RV64-NEXT: addi a7, a5, 819 +; RV64-NEXT: addi a6, a6, -241 +; RV64-NEXT: slli a2, t0, 32 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: slli t2, a6, 32 +; RV64-NEXT: add t0, t0, a2 +; RV64-NEXT: add a7, a7, a5 +; RV64-NEXT: add a6, a6, t2 +; RV64-NEXT: srli a2, a4, 1 +; RV64-NEXT: slli a5, t1, 63 +; RV64-NEXT: slli t2, a4, 63 +; RV64-NEXT: or t3, a5, a2 +; RV64-NEXT: srli a2, a3, 1 +; RV64-NEXT: or t4, a2, t2 +; RV64-NEXT: bnez t3, .LBB3_2 +; RV64-NEXT: # %bb.1: # %_udiv-special-cases +; RV64-NEXT: srli a2, t4, 1 +; RV64-NEXT: or a2, t4, a2 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or 
a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli a2, a2, 56 +; RV64-NEXT: addi t2, a2, 64 +; RV64-NEXT: j .LBB3_3 +; RV64-NEXT: .LBB3_2: +; RV64-NEXT: srli a2, t3, 1 +; RV64-NEXT: or a2, t3, a2 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli t2, a2, 56 +; RV64-NEXT: .LBB3_3: # %_udiv-special-cases +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill +; RV64-NEXT: slli a2, a3, 63 +; RV64-NEXT: li t5, 128 +; RV64-NEXT: bnez a2, .LBB3_5 +; 
RV64-NEXT: # %bb.4: # %_udiv-special-cases +; RV64-NEXT: li s0, 128 +; RV64-NEXT: j .LBB3_6 +; RV64-NEXT: .LBB3_5: +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 2 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 8 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 16 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: srli a5, a2, 32 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: and a5, a5, t0 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: and a5, a2, a7 +; RV64-NEXT: srli a2, a2, 2 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: srli a5, a2, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: and a2, a2, a6 +; RV64-NEXT: slli a5, a2, 8 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 16 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: srli s0, a2, 56 +; RV64-NEXT: .LBB3_6: # %_udiv-special-cases +; RV64-NEXT: ld a5, 0(a1) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: ld s2, 16(a1) +; RV64-NEXT: or a1, t4, t3 +; RV64-NEXT: addi s1, s0, 128 +; RV64-NEXT: bnez a1, .LBB3_8 +; RV64-NEXT: # %bb.7: # %_udiv-special-cases +; RV64-NEXT: mv t2, s1 +; RV64-NEXT: .LBB3_8: # %_udiv-special-cases +; RV64-NEXT: snez s3, a1 +; RV64-NEXT: srli a1, a2, 1 +; RV64-NEXT: slli t3, s2, 63 +; RV64-NEXT: slli t4, a2, 63 +; RV64-NEXT: or a1, t3, a1 +; RV64-NEXT: srli t3, a5, 1 +; RV64-NEXT: or t6, t3, t4 +; RV64-NEXT: bnez a1, .LBB3_10 +; RV64-NEXT: # %bb.9: # %_udiv-special-cases +; RV64-NEXT: srli t3, t6, 1 +; RV64-NEXT: or t3, t6, t3 +; RV64-NEXT: srli t4, t3, 2 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 8 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 16 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 32 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: not t3, t3 +; RV64-NEXT: srli t4, t3, 1 +; 
RV64-NEXT: and t4, t4, t0 +; RV64-NEXT: sub t3, t3, t4 +; RV64-NEXT: and t4, t3, a7 +; RV64-NEXT: srli t3, t3, 2 +; RV64-NEXT: and t3, t3, a7 +; RV64-NEXT: add t3, t4, t3 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: and t3, t3, a6 +; RV64-NEXT: slli t4, t3, 8 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 16 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 32 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: srli t3, t3, 56 +; RV64-NEXT: addi s4, t3, 64 +; RV64-NEXT: j .LBB3_11 +; RV64-NEXT: .LBB3_10: +; RV64-NEXT: srli t3, a1, 1 +; RV64-NEXT: or t3, a1, t3 +; RV64-NEXT: srli t4, t3, 2 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 8 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 16 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: srli t4, t3, 32 +; RV64-NEXT: or t3, t3, t4 +; RV64-NEXT: not t3, t3 +; RV64-NEXT: srli t4, t3, 1 +; RV64-NEXT: and t4, t4, t0 +; RV64-NEXT: sub t3, t3, t4 +; RV64-NEXT: and t4, t3, a7 +; RV64-NEXT: srli t3, t3, 2 +; RV64-NEXT: and t3, t3, a7 +; RV64-NEXT: add t3, t4, t3 +; RV64-NEXT: srli t4, t3, 4 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: and t3, t3, a6 +; RV64-NEXT: slli t4, t3, 8 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 16 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: slli t4, t3, 32 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: srli s4, t3, 56 +; RV64-NEXT: .LBB3_11: # %_udiv-special-cases +; RV64-NEXT: andi t4, s2, 1 +; RV64-NEXT: andi t1, t1, 1 +; RV64-NEXT: or t3, a3, a4 +; RV64-NEXT: or s2, a5, a2 +; RV64-NEXT: sltu s0, s1, s0 +; RV64-NEXT: slli s1, a5, 63 +; RV64-NEXT: addi s3, s3, -1 +; RV64-NEXT: beqz s1, .LBB3_13 +; RV64-NEXT: # %bb.12: +; RV64-NEXT: srli t5, s1, 1 +; RV64-NEXT: or t5, s1, t5 +; RV64-NEXT: srli s1, t5, 2 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 4 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 8 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: srli s1, t5, 16 +; RV64-NEXT: or t5, t5, 
s1 +; RV64-NEXT: srli s1, t5, 32 +; RV64-NEXT: or t5, t5, s1 +; RV64-NEXT: not t5, t5 +; RV64-NEXT: srli s1, t5, 1 +; RV64-NEXT: and t0, s1, t0 +; RV64-NEXT: sub t0, t5, t0 +; RV64-NEXT: and t5, t0, a7 +; RV64-NEXT: srli t0, t0, 2 +; RV64-NEXT: and a7, t0, a7 +; RV64-NEXT: add a7, t5, a7 +; RV64-NEXT: srli t0, a7, 4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: and a6, a7, a6 +; RV64-NEXT: slli a7, a6, 8 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: slli a7, a6, 16 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: slli a7, a6, 32 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: srli t5, a6, 56 +; RV64-NEXT: .LBB3_13: # %_udiv-special-cases +; RV64-NEXT: or t0, t3, t1 +; RV64-NEXT: or a6, s2, t4 +; RV64-NEXT: and a7, s3, s0 +; RV64-NEXT: or t6, t6, a1 +; RV64-NEXT: addi s0, t5, 128 +; RV64-NEXT: bnez t6, .LBB3_15 +; RV64-NEXT: # %bb.14: # %_udiv-special-cases +; RV64-NEXT: mv s4, s0 +; RV64-NEXT: .LBB3_15: # %_udiv-special-cases +; RV64-NEXT: seqz a1, t0 +; RV64-NEXT: sltu t0, s0, t5 +; RV64-NEXT: snez t5, t6 +; RV64-NEXT: addi t5, t5, -1 +; RV64-NEXT: and t0, t5, t0 +; RV64-NEXT: sltu t5, t2, s4 +; RV64-NEXT: seqz a6, a6 +; RV64-NEXT: mv t6, t5 +; RV64-NEXT: beq a7, t0, .LBB3_17 +; RV64-NEXT: # %bb.16: # %_udiv-special-cases +; RV64-NEXT: sltu t6, a7, t0 +; RV64-NEXT: .LBB3_17: # %_udiv-special-cases +; RV64-NEXT: or a1, a1, a6 +; RV64-NEXT: andi a6, t6, 1 +; RV64-NEXT: sub a7, a7, t0 +; RV64-NEXT: sub t5, a7, t5 +; RV64-NEXT: sub a7, t2, s4 +; RV64-NEXT: beqz a6, .LBB3_19 +; RV64-NEXT: # %bb.18: # %_udiv-special-cases +; RV64-NEXT: mv t0, a6 +; RV64-NEXT: j .LBB3_20 +; RV64-NEXT: .LBB3_19: +; RV64-NEXT: sltiu t0, a7, 129 +; RV64-NEXT: xori t0, t0, 1 +; RV64-NEXT: snez t2, t5 +; RV64-NEXT: or t0, t0, t2 +; RV64-NEXT: .LBB3_20: # %_udiv-special-cases +; RV64-NEXT: or t6, a1, t0 +; RV64-NEXT: addi a1, t6, -1 +; RV64-NEXT: and t2, t4, a1 +; RV64-NEXT: and t0, a1, a2 +; RV64-NEXT: and a1, a1, a5 +; RV64-NEXT: bnez t6, .LBB3_30 +; RV64-NEXT: # %bb.21: # %_udiv-special-cases +; RV64-NEXT: 
xori t6, a7, 128 +; RV64-NEXT: or t6, t6, a6 +; RV64-NEXT: or t6, t6, t5 +; RV64-NEXT: beqz t6, .LBB3_30 +; RV64-NEXT: # %bb.22: # %udiv-bb1 +; RV64-NEXT: addi a1, a7, 1 +; RV64-NEXT: sd zero, 64(sp) +; RV64-NEXT: sd zero, 72(sp) +; RV64-NEXT: sd zero, 80(sp) +; RV64-NEXT: sd zero, 88(sp) +; RV64-NEXT: sd a5, 96(sp) +; RV64-NEXT: sd a2, 104(sp) +; RV64-NEXT: sd t4, 112(sp) +; RV64-NEXT: li t0, 128 +; RV64-NEXT: addi t2, sp, 96 +; RV64-NEXT: neg s1, a7 +; RV64-NEXT: seqz t6, a1 +; RV64-NEXT: sub a7, t0, a7 +; RV64-NEXT: add t5, t5, t6 +; RV64-NEXT: andi t0, a7, 63 +; RV64-NEXT: srli a7, a7, 3 +; RV64-NEXT: or t6, a1, t5 +; RV64-NEXT: xori s2, t0, 63 +; RV64-NEXT: andi a7, a7, 24 +; RV64-NEXT: seqz t0, t6 +; RV64-NEXT: sub s3, t2, a7 +; RV64-NEXT: add a6, a6, t0 +; RV64-NEXT: ld t2, 0(s3) +; RV64-NEXT: ld s4, 8(s3) +; RV64-NEXT: andi a7, a6, 1 +; RV64-NEXT: or t6, t6, a7 +; RV64-NEXT: srli a6, t2, 1 +; RV64-NEXT: sll t0, s4, s1 +; RV64-NEXT: srl a6, a6, s2 +; RV64-NEXT: or t0, t0, a6 +; RV64-NEXT: sll a6, t2, s1 +; RV64-NEXT: li t2, 0 +; RV64-NEXT: beqz t6, .LBB3_28 +; RV64-NEXT: # %bb.23: # %udiv-preheader +; RV64-NEXT: li t6, 0 +; RV64-NEXT: li s0, 0 +; RV64-NEXT: srli s4, s4, 1 +; RV64-NEXT: ld s3, 16(s3) +; RV64-NEXT: sd zero, 32(sp) +; RV64-NEXT: sd zero, 40(sp) +; RV64-NEXT: sd zero, 48(sp) +; RV64-NEXT: sd zero, 56(sp) +; RV64-NEXT: sd a5, 0(sp) +; RV64-NEXT: sd a2, 8(sp) +; RV64-NEXT: sd t4, 16(sp) +; RV64-NEXT: sd zero, 24(sp) +; RV64-NEXT: srli a2, a1, 3 +; RV64-NEXT: srl a5, s4, s2 +; RV64-NEXT: mv t4, sp +; RV64-NEXT: snez t3, t3 +; RV64-NEXT: andi a2, a2, 24 +; RV64-NEXT: add t1, t1, t3 +; RV64-NEXT: add a2, t4, a2 +; RV64-NEXT: ld t3, 0(a2) +; RV64-NEXT: ld t4, 8(a2) +; RV64-NEXT: ld a2, 16(a2) +; RV64-NEXT: sll s1, s3, s1 +; RV64-NEXT: andi s2, a1, 63 +; RV64-NEXT: xori s2, s2, 63 +; RV64-NEXT: or s3, s1, a5 +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: slli a5, t4, 1 +; RV64-NEXT: sll a2, a2, s2 +; RV64-NEXT: sll s2, a5, s2 +; RV64-NEXT: srl s1, t4, a1 
+; RV64-NEXT: or s1, s1, a2 +; RV64-NEXT: seqz a2, a3 +; RV64-NEXT: sub a2, a4, a2 +; RV64-NEXT: addi a5, t1, 1 +; RV64-NEXT: andi a5, a5, 1 +; RV64-NEXT: andi s3, s3, 1 +; RV64-NEXT: srl t1, t3, a1 +; RV64-NEXT: or s2, t1, s2 +; RV64-NEXT: addi t1, a3, -1 +; RV64-NEXT: j .LBB3_26 +; RV64-NEXT: .LBB3_24: # %udiv-do-while +; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: sltu t3, a2, s4 +; RV64-NEXT: .LBB3_25: # %udiv-do-while +; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: srli s1, s1, 63 +; RV64-NEXT: sub t4, a5, s1 +; RV64-NEXT: sub t3, t4, t3 +; RV64-NEXT: slli t3, t3, 63 +; RV64-NEXT: srai s1, t3, 63 +; RV64-NEXT: and s3, s1, a4 +; RV64-NEXT: li t3, 0 +; RV64-NEXT: li t4, 0 +; RV64-NEXT: srli s5, a6, 63 +; RV64-NEXT: sub s4, s4, s3 +; RV64-NEXT: slli s3, t0, 1 +; RV64-NEXT: or s3, s3, s5 +; RV64-NEXT: srli t0, t0, 63 +; RV64-NEXT: slli a6, a6, 1 +; RV64-NEXT: or a6, t2, a6 +; RV64-NEXT: seqz t2, a1 +; RV64-NEXT: or s0, s0, t0 +; RV64-NEXT: or s5, a1, t5 +; RV64-NEXT: sub t5, t5, t2 +; RV64-NEXT: and s6, s1, a3 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: andi t2, s1, 1 +; RV64-NEXT: or t0, t6, s3 +; RV64-NEXT: sltu t6, s2, s6 +; RV64-NEXT: snez s5, s5 +; RV64-NEXT: andi s3, s0, 1 +; RV64-NEXT: sub s1, s4, t6 +; RV64-NEXT: add a7, a7, s5 +; RV64-NEXT: addi a7, a7, 1 +; RV64-NEXT: andi a7, a7, 1 +; RV64-NEXT: or t6, a1, t5 +; RV64-NEXT: or s4, t6, a7 +; RV64-NEXT: sub s2, s2, s6 +; RV64-NEXT: li t6, 0 +; RV64-NEXT: li s0, 0 +; RV64-NEXT: beqz s4, .LBB3_29 +; RV64-NEXT: .LBB3_26: # %udiv-do-while +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: srli t3, s2, 63 +; RV64-NEXT: slli t4, s1, 1 +; RV64-NEXT: slli s2, s2, 1 +; RV64-NEXT: or s4, t4, t3 +; RV64-NEXT: andi t3, s3, 1 +; RV64-NEXT: or s2, s2, t3 +; RV64-NEXT: bne a2, s4, .LBB3_24 +; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1 +; RV64-NEXT: sltu t3, t1, s2 +; RV64-NEXT: j .LBB3_25 +; RV64-NEXT: .LBB3_28: +; RV64-NEXT: li t3, 0 +; RV64-NEXT: li t4, 0 +; RV64-NEXT: 
.LBB3_29: # %udiv-loop-exit +; RV64-NEXT: srli a2, a6, 63 +; RV64-NEXT: slli a3, t0, 1 +; RV64-NEXT: srli a4, t0, 63 +; RV64-NEXT: slli a6, a6, 1 +; RV64-NEXT: or a1, t2, a6 +; RV64-NEXT: or a2, t3, a2 +; RV64-NEXT: or a4, t4, a4 +; RV64-NEXT: or t0, a2, a3 +; RV64-NEXT: andi t2, a4, 1 +; RV64-NEXT: .LBB3_30: # %udiv-end +; RV64-NEXT: andi a2, t2, 1 +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: sd t0, 8(a0) +; RV64-NEXT: sb a2, 16(a0) +; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 +; RV64-NEXT: ret %res = udiv i129 %x, %y ret i129 %res } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll index c2b4494..11e7e5c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -1,16 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s -; CHECK-LABEL: mul_v16i8 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16 -; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr 
{{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) -; CHECK: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> {{.*}}, ptr {{.*}}, i32 4, <16 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v16i8( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -16 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 16 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr [[TMP6]], i32 4, <16 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 15 @@ -45,17 +70,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: mul_v8i16 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8 -; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> {{.*}}, ptr {{.*}}, i32 4, <8 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v8i16( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 
[[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 
[[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -90,16 +139,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: mul_v4i32 -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v4i32( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 
@llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -134,17 +208,47 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: split_vector -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], 
%vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @split_vector( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = 
call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXTRACT_1_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[EXTRACT_1_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <2 x i32> [[EXTRACT_1_LOW]], [[EXTRACT_2_LOW]] +; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> [[EXTRACT_1_HIGH]], [[EXTRACT_2_HIGH]] +; CHECK-NEXT: [[COMBINE:%.*]] = shufflevector <2 x i32> [[MUL]], <2 x i32> [[SUB]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -186,14 +290,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry } ; One of the loads now uses ult predicate. 
-; CHECK-LABEL: mismatch_load_pred -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]) define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_load_pred( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; 
CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[WRONG]], <4 x i32> undef) +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 
%N, 0 %tmp8 = add i32 %N, 3 @@ -236,17 +374,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry } ; The store now uses ult predicate. -; CHECK-LABEL: mismatch_store_pred -; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 -; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %wrong) define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_store_pred( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[WRONG]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 
@llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 @@ -294,14 +463,72 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; ; Step value 16 doesn't match vector width 4 ; -; CHECK-LABEL: interleave4 -; CHECK: vector.body: -; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N) -; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N) -; define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @interleave4( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[V0:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[V1:%.*]] = lshr i32 [[V0]], 4 +; CHECK-NEXT: [[V2:%.*]] = shl nuw i32 [[V1]], 4 +; CHECK-NEXT: [[V3:%.*]] = add i32 [[V2]], -16 +; CHECK-NEXT: [[V4:%.*]] = lshr i32 [[V3]], 4 +; CHECK-NEXT: [[V5:%.*]] = add nuw nsw i32 [[V4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, ptr [[A]], i32 8 +; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, ptr [[C]], i32 8 +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i32, ptr [[B]], i32 8 +; 
CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[V5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP37]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV31:%.*]] = phi ptr [ [[SCEVGEP32:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP30]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP25:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[V14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[V6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[V15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[V7:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK15:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V7]], i32 [[N]]) +; CHECK-NEXT: [[V8:%.*]] = add i32 [[V7]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK16:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V8]], i32 [[N]]) +; CHECK-NEXT: [[V9:%.*]] = add i32 [[V8]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK17:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V9]], i32 [[N]]) +; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -2 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP42]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: 
[[SCEVGEP41:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP34:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -2 +; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP34]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP35]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV31]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP36]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[V10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[V11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_MASKED_LOAD18]] +; CHECK-NEXT: [[V12:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD19]] +; CHECK-NEXT: [[V13:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD24]], [[WIDE_MASKED_LOAD20]] +; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -2 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr [[SCEVGEP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -1 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr [[SCEVGEP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]]) +; CHECK-NEXT: call void 
@llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]]) +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 1 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr [[SCEVGEP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]]) +; CHECK-NEXT: [[SCEVGEP25]] = getelementptr i32, ptr [[LSR_IV]], i32 16 +; CHECK-NEXT: [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 16 +; CHECK-NEXT: [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 16 +; CHECK-NEXT: [[V14]] = add i32 [[V9]], 4 +; CHECK-NEXT: [[V15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[V6]], i32 1) +; CHECK-NEXT: [[V16:%.*]] = icmp ne i32 [[V15]], 0 +; CHECK-NEXT: br i1 [[V16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %v0 = add i32 %N, 15 @@ -370,12 +597,42 @@ for.cond.cleanup: ret void } -; CHECK-LABEL: const_expected_in_set_loop -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @const_expected_in_set_loop( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label 
%[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 42) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ 
-413,12 +670,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: tripcount_arg_not_invariant -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @tripcount_arg_not_invariant( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[INDEX]]) +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -458,12 +745,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: addrec_base_not_zero -; CHECK: call <4 x i1> @llvm.get.active.lane.mask -; CHECK-NOT: vctp -; CHECK: ret void -; define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @addrec_base_not_zero( +; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; 
CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 1, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 +; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 +; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = 
icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index fa6a66b..9775cf9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -1,15 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name INST --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s -; CHECK-LABEL: reduction_i32 -; CHECK: phi i32 [ 0, %vector.ph ] -; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] -; CHECK: phi i32 -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) { +; CHECK-LABEL: define i16 @reduction_i32( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP5]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <8 x i16> [ [[VEC_PHI]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi <8 x i1> [ [[TMP5]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <8 x i16> [ [[TMP8]], %[[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[DOTLCSSA3]], <8 x i16> [[DOTLCSSA]], <8 x i16> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[TMP10]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i16> [[RDX_SHUF4]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i16> [[BIN_RDX5]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i16> [[RDX_SHUF6]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[BIN_RDX7]], i32 0 +; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -59,16 +99,52 @@ for.cond.cleanup: ret i16 %res.0 } -; CHECK-LABEL: reduction_i32_with_scalar -; CHECK: vector.body: -; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) -; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; 
CHECK-LABEL: define i16 @reduction_i32_with_scalar( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> 
[[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i16 [[RES_0]] +; entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph @@ -119,15 +195,46 @@ for.cond.cleanup: ; despite this we can still calculate a precise enough range so that the ; the overflow checks for get.active.active.lane.mask don't reject ; tail-predication. 
-; -; CHECK-LABEL: @reduction_not_guarded -; -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; CHECK: ret -; define i16 @reduction_not_guarded(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { +; CHECK-LABEL: define i16 @reduction_not_guarded( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> 
[[VEC_PHI]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: ret i16 [[TMP9]] +; entry: %tmp = add i32 %N, -1 %n.rnd.up = add nuw nsw i32 %tmp, 8 @@ -166,12 +273,76 @@ middle.block: ; preds = %vector.body ret i16 %tmp9 } -; CHECK-LABEL: @Correlation -; CHECK: vector.body: -; CHECK: @llvm.arm.mve.vctp -; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask -; define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @Correlation( +; CHECK-SAME: ptr readonly captures(none) [[INPUT:%.*]], ptr 
captures(none) [[OUTPUT:%.*]], i16 signext [[SIZE:%.*]], i16 signext [[N:%.*]], i16 signext [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END17:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV51:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_END:.*]] ], [ [[TMP0]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV46:%.*]] = phi ptr [ [[SCEVGEP47:%.*]], %[[FOR_END]] ], [ [[INPUT]], %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], %[[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -4 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i32 [[TMP8]], 1 +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label %[[VECTOR_PH:.*]], label %[[FOR_END]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP9]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi ptr [ [[SCEVGEP49:%.*]], %[[VECTOR_BODY]] ], [ [[LSR_IV46]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[INPUT]], %[[VECTOR_PH]] ] +; 
CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP11]]) +; CHECK-NEXT: [[TMP13]] = sub i32 [[TMP11]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV48]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[TMP19]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, ptr [[LSR_IV48]], i32 4 +; CHECK-NEXT: [[TMP21]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1) +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP20]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi 
i32 [ 0, %[[FOR_BODY]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP25:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP25]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[OUTPUT]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP47]] = getelementptr i16, ptr [[LSR_IV46]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV51]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label %[[FOR_END17]], label %[[FOR_BODY]] +; CHECK: [[FOR_END17]]: +; CHECK-NEXT: ret void +; entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index a8ad360..b54d526 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -1,8 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s -; CHECK-LABEL: expand_v8i16_v8i32 -; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v8i16_v8i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define void @expand_v8i16_v8i32( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 
[[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD2]] to <8 x i32> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i32> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 
[[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -39,15 +74,57 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: expand_v8i16_v4i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) -; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8 -; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) -; CHECK: %store.pred = icmp ule <4 x i32> %induction.store -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred) -; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred) define void @expand_v8i16_v4i32(ptr readonly %a, ptr readonly %b, ptr %c, ptr %d, i32 %N) { +; CHECK-LABEL: define void @expand_v8i16_v4i32( +; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10_STORE]], <4 x i32> undef, <4 x i32> 
zeroinitializer +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STORE_IDX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[STORE_IDX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i16> [[EXTRACT_2_LOW]] to <4 x i32> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i16> [[EXTRACT_2_HIGH]] to <4 x i32> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[SUB:%.*]] = mul nsw <4 x i32> [[EXPAND_1]], [[EXPAND_2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[STORE_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT_STORE]], <4 x i32> undef, 
<4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION_STORE:%.*]] = add <4 x i32> [[BROADCAST_SPLAT_STORE]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[STORE_PRED:%.*]] = icmp ule <4 x i32> [[INDUCTION_STORE]], [[BROADCAST_SPLAT11_STORE]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[STORE_IDX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[D]], i32 [[STORE_IDX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[SUB]], ptr [[GEP]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: [[STORE_IDX_NEXT]] = add i32 [[STORE_IDX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp eq i32 %N, 0 %tmp8 = add i32 %N, 7 @@ -98,9 +175,43 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: expand_v4i32_v4i64 -; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v4i32_v4i64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { +; CHECK-LABEL: define void @expand_v4i32_v4i64( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: 
[[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD]] to <4 x i64> +; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD2]] to <4 x i64> +; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i64> [[EXPAND_2]], [[EXPAND_1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; entry: %cmp8 
= icmp eq i32 %N, 0 %tmp8 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index ec542df..fb1a4a4 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -1,24 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s -; CHECK-LABEL: vec_mul_reduce_add - -; CHECK: vector.ph: -; CHECK: %start = call i32 @llvm.start.loop.iterations.i32 -; CHECK: br label %vector.body - -; CHECK: vector.body: -; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) -; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]] -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], - -; CHECK: middle.block: -; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]], -; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) - define i32 @vec_mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { +; CHECK-LABEL: define i32 @vec_mul_reduce_add( +; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw 
i32 [[TMP4]], 1 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP7]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV2]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP11]] = add nsw <4 x i32> [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i32, ptr [[LSR_IV2]], i32 4 +; CHECK-NEXT: [[TMP12]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: 
br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; entry: %cmp8 = icmp eq i32 %N, 0 %0 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll index 04a2268..314e1b4 100644 --- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll +++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll @@ -1,5 +1,6 @@ ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH +; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH target triple = "wasm32" @@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: i32_mac_u8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: 
i32x4.extend_low_i16x8_s +; CHECK: i32x4.mul +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.mul +; MAX-BANDWIDTH: i32x4.add + +; RELAXED-MAX-BANDWIDTH: loop +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; 
RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %mul = mul nsw i32 %conv2, %conv + %add = add nsw i32 %mul, 
%res.08 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: i32_mac_s16: ; CHECK: i32x4.load16x4_s 0:p2align=1 @@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.dot_i16x8_s +; MAX-BANDWIDTH: i32x4.add + +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.add entry: %cmp7.not = icmp eq i32 %N, 0 @@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; 
RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp6.not = icmp eq i32 %N, 0 br i1 %cmp6.not, label %for.cond.cleanup, label %for.body @@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u +; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label 
%for.cond.cleanup, label %for.body @@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i32x4.add +; RELAXED-MAX-BANDWIDTH: i32x4.add + entry: %cmp7.not = icmp eq i32 %N, 0 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body @@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: 
i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u +; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add + + entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u ; MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: v128.load +; RELAXED-MAX-BANDWIDTH: i32x4.mul +; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add +; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u +; RELAXED-MAX-BANDWIDTH: i64x2.add + entry: %cmp6.not = icmp eq i32 %N, 0 br i1 %cmp6.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll new file mode 100644 index 0000000..9716cbe --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s + +target triple = "wasm32" +; relaxed_dot stands for relaxed_dot_i8x16_i7x16_s, as in td +; relaxed_dot_add stands for i32x4.relaxed_dot_i8x16_i7x16_add_s, as in td + +define <8 x i16> @relaxed_dot_sext_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_sext_1: +; CHECK: .functype relaxed_dot_sext_1 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1 +; CHECK-NEXT: return $pop0 + %sext1 = sext <16 x i8> %a to <16 x i16> + %sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = 
shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} + + +define <8 x i16> @relaxed_dot_sext_2(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_sext_2: +; CHECK: .functype relaxed_dot_sext_2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1 +; CHECK-NEXT: return $pop0 + %sext1 = sext <16 x i8> %a to <16 x i16> + %sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle2, %shuffle1 + ret <8 x i16> %res +} + +define <8 x i16> @relaxed_dot_sext_self(<16 x i8> %v) { +; CHECK-LABEL: relaxed_dot_sext_self: +; CHECK: .functype relaxed_dot_sext_self (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $0 +; CHECK-NEXT: return $pop0 + %sext = sext <16 x i8> %v to <16 x i16> + %mul = mul <16 x i16> %sext, %sext + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} + +define <4 x i32> @relaxed_dot_add_from_relaxed_dot(<16 x i8> %a, <16 x i8> %b, <4 x i32> %c) { +; CHECK-LABEL: relaxed_dot_add_from_relaxed_dot: +; CHECK: .functype relaxed_dot_add_from_relaxed_dot (v128, v128, v128) -> (v128) +; CHECK-NEXT: 
# %bb.0: +; CHECK-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s $push0=, $0, $1, $2 +; CHECK-NEXT: return $pop0 + %relaxed_dot_call = call <8 x i16> @llvm.wasm.relaxed.dot.i8x16.i7x16.signed(<16 x i8> %a, <16 x i8> %b) + %sext = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %relaxed_dot_call) + %res = add <4 x i32> %sext, %c + ret <4 x i32> %res +} + +; INFO: Negative test +define <8 x i16> @relaxed_dot_zext(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_zext: +; CHECK: .functype relaxed_dot_zext (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push6=, $0, $1 +; CHECK-NEXT: local.tee $push5=, $2=, $pop6 +; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1 +; CHECK-NEXT: local.tee $push3=, $1=, $pop4 +; CHECK-NEXT: i8x16.shuffle $push1=, $pop5, $pop3, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK-NEXT: i8x16.shuffle $push0=, $2, $1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 +; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %zext1 = zext <16 x i8> %a to <16 x i16> + %zext2 = zext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %zext1, %zext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res + +} + +; INFO: Negative test +define <8 x i16> @relaxed_dot_wrong_shuffle(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: relaxed_dot_wrong_shuffle: +; CHECK: .functype relaxed_dot_wrong_shuffle (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push1=, $0, $1 +; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push0=, $0, $1 +; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %sext1 = sext <16 x i8> %a to <16 x i16> + 
%sext2 = sext <16 x i8> %b to <16 x i16> + %mul = mul <16 x i16> %sext1, %sext2 + %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %res = add <8 x i16> %shuffle1, %shuffle2 + ret <8 x i16> %res +} diff --git a/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll b/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll new file mode 100644 index 0000000..ce7024d --- /dev/null +++ b/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll @@ -0,0 +1,34 @@ +; RUN: llc --code-model=kernel < %s -asm-verbose=0 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: func_no_abs_sym +define i64 @func_no_abs_sym() nounwind { + ; CHECK: movq $no_abs_sym, %rax + %1 = ptrtoint ptr @no_abs_sym to i64 + ret i64 %1 +} + +; CHECK-LABEL: func_abs_sym +define i64 @func_abs_sym() nounwind { + ; CHECK: movabsq $abs_sym, %rax + %1 = ptrtoint ptr @abs_sym to i64 + ret i64 %1 +} + +; CHECK-LABEL: func_abs_sym_in_range +define i64 @func_abs_sym_in_range() nounwind { + ;; The absolute_symbol range fits in 32 bits but we still use movabs + ;; since there's no benefit to using the sign extending instruction + ;; with absolute symbols. 
+ ; CHECK: movabsq $abs_sym_in_range, %rax + %1 = ptrtoint ptr @abs_sym_in_range to i64 + ret i64 %1 +} + +@no_abs_sym = external hidden global [0 x i8] +@abs_sym = external hidden global [0 x i8], !absolute_symbol !0 +@abs_sym_in_range = external hidden global [0 x i8], !absolute_symbol !1 + +!0 = !{i64 -1, i64 -1} ;; Full range +!1 = !{i64 -2147483648, i64 2147483648} ;; In range diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll index af9d944..de9caa5 100644 --- a/llvm/test/CodeGen/X86/apx/cf.ll +++ b/llvm/test/CodeGen/X86/apx/cf.ll @@ -235,9 +235,10 @@ define void @and_cond(i32 %a, i1 %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: setg %al +; CHECK-NEXT: notb %sil ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testb %al, %sil -; CHECK-NEXT: cfcmovel %ecx, 0 +; CHECK-NEXT: cfcmovnel %ecx, 0 ; CHECK-NEXT: retq %is_pos = icmp sgt i32 %a, 0 %not_b = xor i1 %b, true diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index 2aea9c1..632d90d 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@progbits,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index 1aabf66..ed6849a 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -1,8 +1,8 @@ ;; Test if temporary labels are generated for each indirect callsite. 
-;; Test if the .callgraph section contains the MD5 hash of callees' type (type id) +;; Test if the .llvm.callgraph section contains the MD5 hash of callees' type (type id) ;; is correctly paired with its corresponding temporary label generated for indirect ;; call sites annotated with !callee_type metadata. -;; Test if the .callgraph section contains unique direct callees. +;; Test if the .llvm.callgraph section contains unique direct callees. ; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@progbits,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll index 34dc5b8..49cc335 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll @@ -1,7 +1,10 @@ -;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls. +;; Tests that we store the type identifiers in .llvm.callgraph section of the object file for tailcalls. + +; REQUIRES: x86-registered-target +; REQUIRES: arm-registered-target ; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ -; RUN: llvm-readelf -x .callgraph - | FileCheck %s +; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { entry: @@ -27,7 +30,7 @@ declare !type !2 i32 @bar(i8 signext) !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{i64 0, !"_ZTSFiiE.generalized"} -; CHECK: Hex dump of section '.callgraph': +; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00000000 00008e19 0b7f3326 ; CHECK-NEXT: 0x00000010 e3000154 86bc5981 4b8e3000 05000000 ;; Verify that the type id 0x308e4b8159bc8654 is in section. 
diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll index c144a24..8a1c6ca 100644 --- a/llvm/test/CodeGen/X86/call-graph-section.ll +++ b/llvm/test/CodeGen/X86/call-graph-section.ll @@ -1,7 +1,10 @@ -;; Tests that we store the type identifiers in .callgraph section of the object file. +;; Tests that we store the type identifiers in .llvm.callgraph section of the object file. + +; REQUIRES: x86-registered-target +; REQUIRES: arm-registered-target ; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ -; RUN: llvm-readelf -x .callgraph - | FileCheck %s +; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s declare !type !0 void @foo() @@ -31,7 +34,7 @@ entry: ;; Make sure following type IDs are in call graph section ;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 -; CHECK: Hex dump of section '.callgraph': +; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000000 ; CHECK-NEXT: 0x00000010 00000324 44f731f5 eecb3e54 86bc5981 ; CHECK-NEXT: 0x00000020 4b8e307a de6814f8 97fd77 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 5571519..c90344b8 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -502,11 +502,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0] ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # 
[0,16,0,8,8,0,0,0,0,0,2,0,2,0,0,0] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -517,7 +517,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; SSE41-NEXT: psubw %xmm1, %xmm0 -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0] ; SSE41-NEXT: paddw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 @@ -530,7 +530,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7] @@ -541,7 +541,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2] ; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0] ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq @@ -630,7 +630,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pxor 
%xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0] ; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 @@ -641,7 +641,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [171,0,0,0] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: psrlw $7, %xmm2 @@ -654,7 +654,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; AVX-LABEL: combine_vec_udiv_nonuniform4: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [171,0,0,0] ; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 @@ -691,7 +691,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-NEXT: psubw %xmm3, %xmm0 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768] ; SSE2-NEXT: paddw %xmm3, %xmm0 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,u,0,4,0,4,16,0,4,0,0,4,0,0,0,16] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll 
b/llvm/test/CodeGen/X86/cpus-intel.ll index 71253c8..646629d 100644 --- a/llvm/test/CodeGen/X86/cpus-intel.ll +++ b/llvm/test/CodeGen/X86/cpus-intel.ll @@ -39,6 +39,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty @@ -106,6 +107,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll 
b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll index f3950b7..b2b0a6d 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -1,17 +1,101 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -;; A minimal test case. Subsequent PRs will expand on this test case -;; (e.g., with more functions, variables and profiles) and test the hotness -;; reconcillation implementation. +;; Requires asserts for -debug-only. +; REQUIRES: asserts + +; RUN: rm -rf %t && split-file %s %t && cd %t + +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -debug-only=static-data-profile-info \ +; RUN: -data-sections=true -unique-section-names=false \ +; RUN: input-with-data-access-prof-on.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR + ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ ; RUN: -partition-static-data-sections=true \ +; RUN: -debug-only=static-data-profile-info \ ; RUN: -data-sections=true -unique-section-names=false \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR +; RUN: input-with-data-access-prof-off.ll -o - 2>&1 | FileCheck %s --check-prefixes=OFF + +; LOG: hot_bss has section prefix hot, the max from data access profiles as hot and PGO counters as hot +; LOG: data_unknown_hotness has section prefix <empty>, the max from data access profiles as <empty> and PGO counters as unlikely +; LOG: external_relro_array has section prefix unlikely, solely from data access profiles + +; IR: .type hot_bss,@object +; IR-NEXT: .section .bss.hot.,"aw" +; IR: .type data_unknown_hotness,@object +; IR-NEXT: .section .data,"aw" +; IR: .type external_relro_array,@object +; IR-NEXT: .section .data.rel.ro.unlikely.,"aw" + + +; OFF: .type hot_bss,@object +; OFF-NEXT: .section .bss.hot.,"aw" +; OFF: 
.type data_unknown_hotness,@object +; OFF-NEXT: .section .data.unlikely.,"aw" +;; Global variable section prefix metadata is not used when +;; module flag `EnableDataAccessProf` is 0, and @external_relro_array has +;; external linkage, so analysis based on PGO counters doesn't apply. +; OFF: .type external_relro_array,@object # @external_relro_array +; OFF-NEXT: .section .data.rel.ro,"aw" + +;--- input-with-data-access-prof-on.ll +; Internal vars +@hot_bss = internal global i32 0, !section_prefix !17 +@data_unknown_hotness = internal global i32 1 +; External vars +@external_relro_array = constant [2 x ptr] [ptr @hot_bss, ptr @data_unknown_hotness], !section_prefix !18 + +define void @cold_func() !prof !15 { + %9 = load i32, ptr @data_unknown_hotness + %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9) + ret void +} + +define void @hot_func() !prof !14 { + %9 = load i32, ptr @hot_bss + %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9) + ret void +} + +declare i32 @func_taking_arbitrary_param(...) -; IR: .section .bss.hot.,"aw" +!llvm.module.flags = !{!0, !1} +!0 = !{i32 2, !"EnableDataAccessProf", i32 1} +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 1460183} +!5 = !{!"MaxCount", i64 849024} +!6 = !{!"MaxInternalCount", i64 32769} +!7 = !{!"MaxFunctionCount", i64 849024} +!8 = !{!"NumCounts", i64 23627} +!9 = !{!"NumFunctions", i64 3271} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13} +!12 = !{i32 990000, i64 166, i32 73} +!13 = !{i32 999999, i64 3, i32 1443} +!14 = !{!"function_entry_count", i64 100000} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 1, i32 99999} +!17 = !{!"section_prefix", !"hot"} +!18 = !{!"section_prefix", !"unlikely"} + +;--- input-with-data-access-prof-off.ll +; Same as file above except that module flag `EnableDataAccessProf` has value 0. 
+; Internal vars @hot_bss = internal global i32 0, !section_prefix !17 +@data_unknown_hotness = internal global i32 1 +; External vars +@external_relro_array = constant [2 x ptr] [ptr @hot_bss, ptr @data_unknown_hotness], !section_prefix !18 + +define void @cold_func() !prof !15 { + %9 = load i32, ptr @data_unknown_hotness + %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9) + ret void +} define void @hot_func() !prof !14 { %9 = load i32, ptr @hot_bss @@ -21,8 +105,9 @@ define void @hot_func() !prof !14 { declare i32 @func_taking_arbitrary_param(...) -!llvm.module.flags = !{!1} +!llvm.module.flags = !{!0, !1} +!0 = !{i32 2, !"EnableDataAccessProf", i32 0} !1 = !{i32 1, !"ProfileSummary", !2} !2 = !{!3, !4, !5, !6, !7, !8, !9, !10} !3 = !{!"ProfileFormat", !"InstrProf"} @@ -40,3 +125,4 @@ declare i32 @func_taking_arbitrary_param(...) !15 = !{!"function_entry_count", i64 1} !16 = !{!"branch_weights", i32 1, i32 99999} !17 = !{!"section_prefix", !"hot"} +!18 = !{!"section_prefix", !"unlikely"} diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll index c2b7068..df04b67 100644 --- a/llvm/test/CodeGen/X86/isel-fpclass.ll +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86,X86-SDAGISEL +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL ; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL ; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL -; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86,X86-GISEL -; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | 
FileCheck %s -check-prefixes=X64-GISEL +; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL define i1 @isnone_f(float %x) nounwind { ; X86-LABEL: isnone_f: @@ -23,11 +23,6 @@ define i1 @isnone_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isnone_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %eax, %eax -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) ret i1 %0 @@ -50,27 +45,22 @@ define i1 @isany_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isany_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movb $1, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) ret i1 %0 } define i1 @issignaling_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: issignaling_f: -; X86-SDAGISEL: # %bb.0: -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %cl -; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: andb %cl, %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: issignaling_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl ; ; X64-LABEL: issignaling_f: ; X64: # %bb.0: @@ -97,44 +87,18 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: andb %cl, %al ; 
X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: issignaling_f: -; X86-GISEL: # %bb.0: -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %dl -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: andb %dl, %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: issignaling_f: -; X64-GISEL: # %bb.0: -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %dl -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: andb %dl, %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %a0 } define i1 @isquiet_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: isquiet_f: ; X64: # %bb.0: # %entry @@ -155,39 +119,19 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isquiet_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl 
$2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setae %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isquiet_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setae %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 } define i1 @not_isquiet_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isquiet_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: not_isquiet_f: ; X64: # %bb.0: # %entry @@ -208,57 +152,19 @@ define i1 @not_isquiet_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isquiet_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %cl -; X86-GISEL-NEXT: orb %dl, %cl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta 
%dl -; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: andb %dl, %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isquiet_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %cl -; X64-GISEL-NEXT: orb %dl, %cl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %dl -; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: andb %dl, %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 } define i1 @isinf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: isinf_f: ; X64: # %bb.0: # %entry @@ -279,39 +185,19 @@ define i1 @isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isinf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl 
$2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isinf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 } define i1 @not_isinf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isinf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_isinf_f: ; X64: # %bb.0: # %entry @@ -332,43 +218,17 @@ define i1 @not_isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isinf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %dl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isinf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 
0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 } define i1 @is_plus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_plus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_plus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_inf_f: ; X64: # %bb.0: # %entry @@ -386,34 +246,17 @@ define i1 @is_plus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_plus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_plus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 } define i1 @is_minus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: sete %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: 
cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-LABEL: is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -431,34 +274,17 @@ define i1 @is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_minus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-GISEL-NEXT: sete %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_minus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-GISEL-NEXT: sete %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 } define i1 @not_is_minus_inf_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_is_minus_inf_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-SDAGISEL-NEXT: setne %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: setne %al +; X86-NEXT: retl ; ; X64-LABEL: not_is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -476,55 +302,19 @@ define i1 @not_is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_is_minus_inf_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: pushl %ebx -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: movl %eax, %ecx -; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %edx, %edx -; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %bl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # 
imm = 0x7F800000 -; X86-GISEL-NEXT: sete %ah -; X86-GISEL-NEXT: orb %dl, %ah -; X86-GISEL-NEXT: orb %bl, %ah -; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %ah, %al -; X86-GISEL-NEXT: popl %ebx -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_is_minus_inf_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: movl %eax, %ecx -; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %edx, %edx -; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %sil -; X64-GISEL-NEXT: orb %dl, %sil -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %dl -; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: orb %sil, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 } define i1 @isfinite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setl %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl ; ; X64-LABEL: isfinite_f: ; X64: # %bb.0: # %entry @@ -545,39 +335,19 @@ define i1 @isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: isfinite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl 
%ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: isfinite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } define i1 @not_isfinite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: not_isfinite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setge %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: not_isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-LABEL: not_isfinite_f: ; X64: # %bb.0: # %entry @@ -598,43 +368,17 @@ define i1 @not_isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: not_isfinite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: sete %dl -; X86-GISEL-NEXT: orb %cl, %dl -; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-GISEL-NEXT: seta %al -; X86-GISEL-NEXT: orb %dl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: not_isfinite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movd %xmm0, 
%eax -; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: sete %dl -; X64-GISEL-NEXT: orb %cl, %dl -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: seta %al -; X64-GISEL-NEXT: orb %dl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 } define i1 @is_plus_finite_f(float %x) nounwind { -; X86-SDAGISEL-LABEL: is_plus_finite_f: -; X86-SDAGISEL: # %bb.0: # %entry -; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-SDAGISEL-NEXT: setb %al -; X86-SDAGISEL-NEXT: retl +; X86-LABEL: is_plus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setb %al +; X86-NEXT: retl ; ; X64-LABEL: is_plus_finite_f: ; X64: # %bb.0: # %entry @@ -652,23 +396,6 @@ define i1 @is_plus_finite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setb %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl -; -; X86-GISEL-LABEL: is_plus_finite_f: -; X86-GISEL: # %bb.0: # %entry -; X86-GISEL-NEXT: xorl %ecx, %ecx -; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-GISEL-NEXT: setb %al -; X86-GISEL-NEXT: orb %cl, %al -; X86-GISEL-NEXT: retl -; -; X64-GISEL-LABEL: is_plus_finite_f: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: xorl %ecx, %ecx -; X64-GISEL-NEXT: movd %xmm0, %eax -; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-GISEL-NEXT: setb %al -; X64-GISEL-NEXT: orb %cl, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" ret i1 %0 @@ -691,11 +418,6 @@ define i1 @isnone_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isnone_d: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: 
xorl %eax, %eax -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) ret i1 %0 @@ -718,11 +440,6 @@ define i1 @isany_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl -; -; X64-GISEL-LABEL: isany_d: -; X64-GISEL: # %bb.0: # %entry -; X64-GISEL-NEXT: movb $1, %al -; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) ret i1 %0 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index bdb7c30..4ec54d8 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2071,7 +2071,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32] ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index d752659..04f0a65 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-NOVBMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI ; Make sure CPUs default to prefer-256-bit. 
avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 @@ -883,6 +883,30 @@ define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal } define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: mul256: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-NOVBMI-NEXT: vzeroupper +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: mul256: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0 @@ -960,6 +984,21 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" } define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" { +; CHECK-SKX-NOVBMI-LABEL: mul512: +; CHECK-SKX-NOVBMI: # %bb.0: +; 
CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) +; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-SKX-NOVBMI-NEXT: vzeroupper +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: mul512: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 @@ -1137,6 +1176,14 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector } define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 @@ -1192,6 +1239,14 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w } define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_sign: +; CHECK-SKX-NOVBMI: # %bb.0: +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,1,3] +; CHECK-SKX-NOVBMI-NEXT: retq +; ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign: ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index cc4bda8..650b562 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 7c1a1e2..874d885 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 6174011..83a0ddb 100644 --- 
a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI @@ -598,6 +598,33 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; +; AVX512F-LABEL: var_shuffle_zero_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] +; AVX512F-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] +; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; ; AVX512VL-LABEL: var_shuffle_zero_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 @@ -921,6 +948,28 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; +; AVX512F-LABEL: var_shuffle_zero_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: 
def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; ; AVX512VL-LABEL: var_shuffle_zero_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 9b52857..d16b28a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1872,7 +1872,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] @@ -1964,7 +1964,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 983ae59..3d85d55 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -851,7 +851,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; 
SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddw %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index d565ef0..1602cde 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1673,7 +1673,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] @@ -1750,7 +1750,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 8cb2c7b..a847da6 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1223,7 +1223,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; SSE2: # 
%bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1275,7 +1275,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 57874c4..eb39b6a 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1480,7 +1480,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1532,7 +1532,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE-NEXT: pandn %xmm0, %xmm1 -; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] ; X86-SSE-NEXT: por %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <4 x i16> %a, 
<i16 0, i16 1, i16 2, i16 3> |