Diffstat (limited to 'llvm/test/CodeGen')
 llvm/test/CodeGen/AArch64/popcount_vmask.ll                                  |  315
 llvm/test/CodeGen/AArch64/vector-minmax.ll                                   |  119
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll                                    |  105
 llvm/test/CodeGen/ARM/ldexp-fp128.ll                                         |   66
 llvm/test/CodeGen/LoongArch/ldptr.ll                                         |   10
 llvm/test/CodeGen/LoongArch/sink-fold-addi.ll                                |   88
 llvm/test/CodeGen/LoongArch/stptr.ll                                         |    8
 llvm/test/CodeGen/RISCV/rv64xtheadba.ll                                      |   19
 llvm/test/CodeGen/RISCV/rv64zba.ll                                           |  209
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll                                |   16
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir                               |   24
 llvm/test/CodeGen/RISCV/zicond-opts.ll                                       |   32
 llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll                          |   36
 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll |   98
 llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll                            |  342
 llvm/test/CodeGen/WebAssembly/memory-interleave.ll                           | 1106
 llvm/test/CodeGen/X86/pr166744.ll                                            |   66
17 files changed, 1839 insertions, 820 deletions
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
new file mode 100644
index 0000000..e784ead
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -0,0 +1,315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddlv s0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = bitcast <16 x i1> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: saddlv s0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: saddlv s0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i32> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i32> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %mask = icmp slt <1 x i64> %a, %b
+ %t1 = bitcast <1 x i1> %mask to i1
+ %t2 = call i1 @llvm.ctpop(i1 %t1)
+ %t3 = zext i1 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i64> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i32
+ ret i32 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddlv s0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = bitcast <16 x i1> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: saddlv s0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: saddlv s0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i32> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i32> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %mask = icmp slt <1 x i64> %a, %b
+ %t1 = bitcast <1 x i1> %mask to i1
+ %t2 = call i1 @llvm.ctpop(i1 %t1)
+ %t3 = zext i1 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i64> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i64
+ ret i64 %t3
+}
+
+define i32 @non_vmask_popcount_1(half %a) {
+; CHECK-LABEL: non_vmask_popcount_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %t1 = bitcast half %a to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @non_vmask_popcount_2(<8 x i16> %a) {
+; CHECK-LABEL: non_vmask_popcount_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: bfi w8, w9, #2, #2
+; CHECK-NEXT: umov w9, v0.b[3]
+; CHECK-NEXT: bfi w8, w10, #4, #2
+; CHECK-NEXT: umov w10, v0.b[4]
+; CHECK-NEXT: bfi w8, w9, #6, #2
+; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: bfi w8, w10, #8, #2
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: bfi w8, w9, #10, #2
+; CHECK-NEXT: umov w9, v0.b[7]
+; CHECK-NEXT: bfi w8, w10, #12, #2
+; CHECK-NEXT: orr w8, w8, w9, lsl #14
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %mask = trunc <8 x i16> %a to <8 x i2>
+ %t1 = bitcast <8 x i2> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll
new file mode 100644
index 0000000..6696f94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE
+
+define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: smax_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: smin_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smin_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: umax_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: umax_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: umin_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: umin_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl2
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %0
+}
+
+define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){
+; CHECK-LABEL: smax_v1i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt d2, d0, d1
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v1i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ptrue p0.d, vl1
+; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <1 x i64> @llvm.smax.v2i64(<1 x i64> %a, <1 x i64> %b)
+ ret <1 x i64> %0
+}
+
+; This is legal for Neon, so this should use the Neon smax.
+define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){
+; CHECK-LABEL: smax_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+;
+; CHECK-SVE-LABEL: smax_v4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT: ret
+entry:
+ %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index b2635d3..3685e9c 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,111 +730,6 @@ entry:
   ret void
 }
 
-define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
- <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
-; CHECK-LABEL: store_factor8:
-; CHECK: .Lfunc_begin17:
-; CHECK-NEXT: .cfi_startproc
-; CHECK-NEXT: // %bb.0:
-; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
-; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
-; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
-; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
-; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
-; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
-; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
-; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
-; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
-; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
-; CHECK-NEXT: ret
-
- %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
- %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
- %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
- store <32 x i32> %interleaved.vec, ptr %ptr, align 4
- ret void
-}
-
-define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
- <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
- <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
- <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
-; CHECK-LABEL: store_factor16:
-; CHECK: .Lfunc_begin18:
-; CHECK-NEXT: .cfi_startproc
-; CHECK-NEXT: // %bb.0:
-; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
-; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
-; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
-; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
-; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
-; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
-; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
-; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
-; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
-; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
-; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
-; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
-; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
-; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
-; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
-; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
-; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
-; CHECK-NEXT: add x8, x0, #128
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
-; CHECK-NEXT: add x8, x0, #192
-; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
-; CHECK-NEXT: ret
-
- %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
- %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
- %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-
- %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
- store <64 x i32> %interleaved.vec, ptr %ptr, align 4
- ret void
-}
-
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/ARM/ldexp-fp128.ll b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
new file mode 100644
index 0000000..93fcd39e8
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck -check-prefix=LINUX %s
+
+define fp128 @testExpl(fp128 %val, i32 %a) {
+; LINUX-LABEL: testExpl:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r11, lr}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: ldr r12, [sp, #16]
+; LINUX-NEXT: str r12, [sp]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: pop {r11, pc}
+ %call = tail call fp128 @ldexpl(fp128 %val, i32 %a)
+ ret fp128 %call
+}
+
+declare fp128 @ldexpl(fp128, i32) memory(none)
+
+define fp128 @test_ldexp_f128_i32(fp128 %val, i32 %a) {
+; LINUX-LABEL: test_ldexp_f128_i32:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r11, lr}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: ldr r12, [sp, #16]
+; LINUX-NEXT: str r12, [sp]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: pop {r11, pc}
+ %call = tail call fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %call
+}
+
+define <2 x fp128> @test_ldexp_v2f128_v2i32(<2 x fp128> %val, <2 x i32> %a) {
+; LINUX-LABEL: test_ldexp_v2f128_v2i32:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r4, r5, r6, lr}
+; LINUX-NEXT: vpush {d8}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: mov r5, r3
+; LINUX-NEXT: add r3, sp, #40
+; LINUX-NEXT: mov r6, r2
+; LINUX-NEXT: mov r4, r0
+; LINUX-NEXT: ldm r3, {r0, r1, r2, r3}
+; LINUX-NEXT: vldr d8, [sp, #56]
+; LINUX-NEXT: vst1.32 {d8[1]}, [sp:32]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: ldr r12, [sp, #32]
+; LINUX-NEXT: vst1.32 {d8[0]}, [sp:32]
+; LINUX-NEXT: ldr lr, [sp, #36]
+; LINUX-NEXT: str r0, [r4, #16]
+; LINUX-NEXT: mov r0, r6
+; LINUX-NEXT: str r1, [r4, #20]
+; LINUX-NEXT: mov r1, r5
+; LINUX-NEXT: str r2, [r4, #24]
+; LINUX-NEXT: mov r2, r12
+; LINUX-NEXT: str r3, [r4, #28]
+; LINUX-NEXT: mov r3, lr
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: stm r4, {r0, r1, r2, r3}
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: vpop {d8}
+; LINUX-NEXT: pop {r4, r5, r6, pc}
+ %call = tail call <2 x fp128> @llvm.ldexp.v2f128.v2i32(<2 x fp128> %val, <2 x i32> %a)
+ ret <2 x fp128> %call
+}
diff --git a/llvm/test/CodeGen/LoongArch/ldptr.ll b/llvm/test/CodeGen/LoongArch/ldptr.ll
index c3656a6..9bafa10 100644
--- a/llvm/test/CodeGen/LoongArch/ldptr.ll
+++ b/llvm/test/CodeGen/LoongArch/ldptr.ll
@@ -24,8 +24,7 @@ define signext i32 @ldptr_w(ptr %p) nounwind {
 ; LA32-LABEL: ldptr_w:
 ; LA32: # %bb.0: # %entry
 ; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ld.w $a0, $a0, 1
 ; LA32-NEXT: ret
 ;
 ; LA64-LABEL: ldptr_w:
@@ -81,10 +80,9 @@ entry:
 define i64 @ldptr_d(ptr %p) nounwind {
 ; LA32-LABEL: ldptr_d:
 ; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a1, $a0, 1
-; LA32-NEXT: ld.w $a0, $a1, 0
-; LA32-NEXT: ld.w $a1, $a1, 4
+; LA32-NEXT: addi.w $a1, $a0, 2047
+; LA32-NEXT: ld.w $a0, $a1, 1
+; LA32-NEXT: ld.w $a1, $a1, 5
 ; LA32-NEXT: ret
 ;
 ; LA64-LABEL: ldptr_d:
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
index 9a806a1..93f73e5 100644
--- a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -25,14 +25,13 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a1, $a0, 4
 ; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB0_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -45,8 +44,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: ld.w $a0, $s2, 4
-; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: ld.w $a0, $s2, 12
+; LA32-NEXT: ld.w $a1, $s2, 8
 ; LA32-NEXT: add.w $a0, $a0, $s6
 ; LA32-NEXT: add.w $s3, $a1, $s3
 ; LA32-NEXT: sltu $a1, $s3, $a1
@@ -63,8 +62,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s3, $zero
 ; LA32-NEXT: move $s6, $zero
 ; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
-; LA32-NEXT: st.w $s3, $s2, 0
-; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: st.w $s3, $s2, 8
+; LA32-NEXT: st.w $s6, $s2, 12
 ; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
@@ -88,8 +87,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $s0, $a1
 ; LA64-NEXT: slli.d $a1, $a0, 4
 ; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $s0, .LBB0_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -100,7 +98,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: ld.d $a0, $s1, 8
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: add.d $s2, $a0, $s2
 ; LA64-NEXT: bnez $s0, .LBB0_2
@@ -108,7 +106,7 @@
 ; LA64-NEXT: .LBB0_3:
 ; LA64-NEXT: move $s2, $zero
 ; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
-; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: st.d $s2, $s1, 8
 ; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -153,14 +151,13 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a1, $a0, 4
 ; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB1_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -172,7 +169,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: fld.s $fa0, $s2, 16
 ; LA32-NEXT: addi.w $s3, $s3, 1
 ; LA32-NEXT: sltui $a0, $s3, 1
 ; LA32-NEXT: add.w $s4, $s4, $a0
@@ -185,7 +182,7 @@
 ; LA32-NEXT: .LBB1_3:
 ; LA32-NEXT: movgr2fr.w $fs0, $zero
 ; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
-; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fst.s $fs0, $s2, 16
 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
@@ -208,8 +205,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $s0, $a1
 ; LA64-NEXT: slli.d $a1, $a0, 4
 ; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $s0, .LBB1_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -220,7 +216,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: fld.s $fa0, $s1, 16
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
 ; LA64-NEXT: bnez $s0, .LBB1_2
@@ -228,7 +224,7 @@
 ; LA64-NEXT: .LBB1_3:
 ; LA64-NEXT: movgr2fr.w $fs0, $zero
 ; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
-; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fst.s $fs0, $s1, 16
 ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -271,14 +267,13 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s0, $a3
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a0, $a0, 6
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB2_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -291,7 +286,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: vld $vr0, $s2, 16
 ; LA32-NEXT: addi.w $s3, $s3, 1
 ; LA32-NEXT: sltui $a0, $s3, 1
 ; LA32-NEXT: add.w $s4, $s4, $a0
@@ -307,7 +302,7 @@
 ; LA32-NEXT: .LBB2_3:
 ; LA32-NEXT: vrepli.b $vr0, 0
 ; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
-; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: vst $vr0, $s2, 16
 ; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -326,8 +321,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
 ; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
 ; LA64-NEXT: slli.d $a0, $a0, 6
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $a1, .LBB2_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -340,7 +334,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: vld $vr0, $s1, 16
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
 ; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
@@ -351,7 +345,7 @@
 ; LA64-NEXT: .LBB2_3:
 ; LA64-NEXT: vrepli.b $vr0, 0
 ; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
-; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: vst $vr0, $s1, 16
 ; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -393,14 +387,13 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s0, $a3
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a0, $a0, 6
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB3_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -413,7 +406,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: xvld $xr0, $s2, 32
 ; LA32-NEXT: addi.w $s3, $s3, 1
 ; LA32-NEXT: sltui $a0, $s3, 1
 ; LA32-NEXT: add.w $s4, $s4, $a0
@@ -429,7 +422,7 @@
 ; LA32-NEXT: .LBB3_3:
 ; LA32-NEXT: xvrepli.b $xr0, 0
 ; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
-; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: xvst $xr0, $s2, 32
 ; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -448,8 +441,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
 ; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
 ; LA64-NEXT: slli.d $a0, $a0, 6
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $a1, .LBB3_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -462,7 +454,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: xvld $xr0, $s1, 32
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
 ; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
@@ -473,7 +465,7 @@
 ; LA64-NEXT: .LBB3_3:
 ; LA64-NEXT: xvrepli.b $xr0, 0
 ; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
-; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: xvst $xr0, $s1, 32
 ; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
@@ -516,14 +508,13 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a1, $a0, 4
 ; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB4_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -536,7 +527,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: vldrepl.b $vr0, $s2, 16
 ; LA32-NEXT: addi.w $s3, $s3, 1
 ; LA32-NEXT: sltui $a0, $s3, 1
 ; LA32-NEXT: add.w $s4, $s4, $a0
@@ -552,7 +543,7 @@
 ; LA32-NEXT: .LBB4_3:
 ; LA32-NEXT: vrepli.b $vr0, 0
 ; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
-; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: vstelm.b $vr0, $s2, 16, 1
 ; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -573,8 +564,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $s0, $a1
 ; LA64-NEXT: slli.d $a1, $a0, 4
 ; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $s0, .LBB4_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -586,7 +576,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 16
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
 ; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
@@ -597,7 +587,7 @@
 ; LA64-NEXT: .LBB4_3:
 ; LA64-NEXT: vrepli.b $vr0, 0
 ; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
-; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: vstelm.b $vr0, $s1, 16, 1
 ; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -643,14 +633,13 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: move $s1, $a2
 ; LA32-NEXT: slli.w $a1, $a0, 4
 ; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
 ; LA32-NEXT: sltui $a1, $a3, 1
 ; LA32-NEXT: slti $a2, $a3, 0
 ; LA32-NEXT: masknez $a2, $a2, $a1
 ; LA32-NEXT: sltui $a3, $s1, 1
 ; LA32-NEXT: maskeqz $a1, $a3, $a1
 ; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: add.w $s2, $a4, $a0
 ; LA32-NEXT: bnez $a1, .LBB5_3
 ; LA32-NEXT: # %bb.1: # %for.body.preheader
 ; LA32-NEXT: move $fp, $a4
@@ -663,7 +652,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; LA32-NEXT: move $a0, $fp
 ; LA32-NEXT: bl f
-; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 8
 ; LA32-NEXT: addi.w $s3, $s3, 1
 ; LA32-NEXT: sltui $a0, $s3, 1
 ; LA32-NEXT: add.w $s4, $s4, $a0
@@ -679,7 +668,7 @@
 ; LA32-NEXT: .LBB5_3:
 ; LA32-NEXT: xvrepli.b $xr0, 0
 ; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
-; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: xvstelm.d $xr0, $s2, 8, 1
 ; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
 ; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -700,8 +689,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $s0, $a1
 ; LA64-NEXT: slli.d $a1, $a0, 4
 ; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: add.d $s1, $a2, $a0
 ; LA64-NEXT: blez $s0, .LBB5_3
 ; LA64-NEXT: # %bb.1: # %for.body.preheader
 ; LA64-NEXT: move $fp, $a2
@@ -713,7 +701,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT: move $a0, $fp
 ; LA64-NEXT: pcaddu18i $ra, %call36(f)
 ; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 8
 ; LA64-NEXT: addi.d $s0, $s0, -1
 ; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
 ; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
@@ -724,7 +712,7 @@
 ; LA64-NEXT: .LBB5_3:
 ; LA64-NEXT: xvrepli.b $xr0, 0
 ; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
-; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: xvstelm.d $xr0, $s1, 8, 1
 ; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
 ; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/stptr.ll b/llvm/test/CodeGen/LoongArch/stptr.ll
index d70f9f4..23b433a 100644
--- a/llvm/test/CodeGen/LoongArch/stptr.ll
+++ b/llvm/test/CodeGen/LoongArch/stptr.ll
@@ -23,8 +23,7 @@ define void @stptr_w(ptr %p, i32 signext %val) nounwind {
 ; LA32-LABEL: stptr_w:
 ; LA32: # %bb.0:
 ; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a1, $a0, 1
 ; LA32-NEXT: ret
 ;
 ; LA64-LABEL: stptr_w:
@@ -77,9 +76,8 @@ define void @stptr_d(ptr %p, i64 %val) nounwind {
 ; LA32-LABEL: stptr_d:
 ; LA32: # %bb.0:
 ; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: st.w $a2, $a0, 4
-; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a2, $a0, 5
+; LA32-NEXT: st.w $a1, $a0, 1
 ; LA32-NEXT: ret
 ;
 ; LA64-LABEL: stptr_d:
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22b..f4964288 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA: # %bb.0:
+; RV64XTHEADBA-NEXT: th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT: ret
 %c = mul i64 %a, 22
 %d = add i64 %c, %b
 ret i64 %d
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd7626..d4b2288 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
 ret i64 %d
 }
 
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a2, a0, 1
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 14
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
 define i64 @addmul18(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul18:
 ; RV64I: # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
 %c = mul i64 %a, 22
 %d = add i64 %c, %b
 ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
 ret i64 %d
 }
 
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 26
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 26
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
 define i64 @addmul36(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul36:
 ; RV64I: # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
 ret i64 %d
 }
 
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 38
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 38
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 42
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 42
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
 define i64 @addmul72(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul72:
 ; RV64I: # %bb.0:
@@ -747,6 +866,84 @@ define i64 @addmul72(i64 %a, i64 %b) {
 ret i64 %d
 }
 
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 74
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 74
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 82
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 82
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 146
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 146
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
 define i64 @mul50(i64 %a) {
 ; RV64I-LABEL: mul50:
 ; RV64I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 20034b6..b6e29cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -863,3 +863,19 @@ entry:
 i64 2)
 ret <vscale x 1 x double> %2
 }
+
+; The two vsetvlis will be coalesced so the add will be made dead and
+; removed. Make sure we shrink the live interval of %x.
+define void @non_li_addi(i64 %x, ptr %p) {
+; CHECK-LABEL: non_li_addi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: ret
+entry:
+ %add = add i64 %x, 1
+ %0 = tail call i64 @llvm.riscv.vsetvli(i64 %add, i64 3, i64 0)
+ %1 = call <vscale x 8 x i8> @llvm.riscv.vle(<vscale x 8 x i8> poison, ptr %p, i64 %0)
+ %2 = tail call i64 @llvm.riscv.vsetvli(i64 1, i64 3, i64 0)
+ %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index fdd30c9..f9929c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -104,6 +104,10 @@
 ret void
 }
 
+ define void @non_li_addi() {
+ ret void
+ }
+
 declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
 declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -664,3 +668,23 @@ body: |
 bb.2:
 $x10 = COPY %vl
 PseudoRET implicit killed $x10
+...
+---
+# The two vsetvlis will be coalesced so the ADDI will be made dead and removed.
+# Make sure we shrink the live interval of %0.
+name: non_li_addi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x10
+ ; CHECK-LABEL: name: non_li_addi
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoVSETIVLI:%[0-9]+]]:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gpr = COPY $x10
+ %1:gprnox0 = ADDI %0, 1
+ %2:gprnox0 = PseudoVSETVLI %1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ %3:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
index d8e2b2c..305ab93 100644
--- a/llvm/test/CodeGen/RISCV/zicond-opts.ll
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -263,3 +263,35 @@ define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) {
 %7 = and i64 %6, %f
 ret i64 %7
 }
+
+define i32 @pr166596(i32 %conv.i, i1 %iszero) #0 {
+; RV32ZICOND-LABEL: pr166596:
+; RV32ZICOND: # %bb.0: # %entry
+; RV32ZICOND-NEXT: andi a1, a1, 1
+; RV32ZICOND-NEXT: xori a0, a0, 1
+; RV32ZICOND-NEXT: zext.h a0, a0
+; RV32ZICOND-NEXT: clz a0, a0
+; RV32ZICOND-NEXT: addi a0, a0, 41
+; RV32ZICOND-NEXT: czero.nez a0, a0, a1
+; RV32ZICOND-NEXT: addi a0, a0, -9
+; RV32ZICOND-NEXT: ret
+;
+; RV64ZICOND-LABEL: pr166596:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: andi a1, a1, 1
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: zext.h a0, a0
+; RV64ZICOND-NEXT: clz a0, a0
+; RV64ZICOND-NEXT: addi a0, a0, 9
+; RV64ZICOND-NEXT: czero.nez a0, a0, a1
+; RV64ZICOND-NEXT: addi a0, a0, -9
+; RV64ZICOND-NEXT: ret
+entry:
+ %not.i = xor i32 %conv.i, 1
+ %conv2.i = trunc i32 %not.i to i16
+ %conv22 = zext i16 %conv2.i to i64
+ %0 = call i64 @llvm.ctlz.i64(i64 %conv22, i1 false)
+ %cast = trunc i64 %0 to i32
+ %clzg = select i1 %iszero, i32 -9, i32 %cast
+ ret i32 %clzg
+}
diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
new file mode 100644
index 0000000..677291a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
@@ -0,0 +1,36 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; The test checks command-line option which allows to represent unknown
+; intrinsics as external function calls in SPIR-V.
+
+; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo)
+
+; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter"
+; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic"
+; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import
+; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import
+; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64
+; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]]
+; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]]
+; CHECK: OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]]
+
+define spir_func void @foo() {
+entry:
+; TODO: if and when the SPIR-V learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic
+ %0 = call i64 @llvm.readcyclecounter()
+ %1 = call i64 @llvm.some.custom.intrinsic()
+ ret void
+}
+
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.some.custom.intrinsic()
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
new file mode 100644
index 0000000..f6b6115
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %}
+
+%opencl.pipe_ro_t = type opaque
+%opencl.pipe_wo_t = type opaque
+
+; CHECK-SPIRV: OpCapability BlockingPipesALTERA
+; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes"
+; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly
+; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly
+; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]]
+
+define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 0), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 0), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 1), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+
+define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 1), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe",
1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) +; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) + +define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) { +entry: + %_Data.addr = alloca ptr addrspace(4), align 8 + %_WPipe = alloca target("spirv.Pipe", 1), align 8 + %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4) + %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)* + store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8 + %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr + %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8 + %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8 + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) +
\ No newline at end of file diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index a8d37be..c44b3bb 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -2808,6 +2808,348 @@ entry: ret <4 x i32> %spec.store.select7 } +define <2 x i8> @fptosi_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i8: +; CHECK: .functype fptosi_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i8> @fptoui_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i8: +; CHECK: .functype fptoui_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i16> @fptosi_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i16: +; CHECK: .functype fptosi_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i16> + ret <2 x i16> %conv +} + +define <2 x i16> @fptoui_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i16: +; CHECK: .functype fptoui_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i16> + ret <2 x i16> %conv +} + +define <4 x i8> @fptosi_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i8: +; CHECK: .functype fptosi_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i8> @fptoui_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i8: +; CHECK: .functype fptoui_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: 
i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i16> @fptosi_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i16: +; CHECK: .functype fptosi_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <4 x i16> @fptoui_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i16: +; CHECK: .functype fptoui_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <8 x i8> @fptosi_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptosi_v8f32_v8i8: +; CHECK: .functype fptosi_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <8 x float> %x to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i8> @fptoui_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptoui_v8f32_v8i8: +; CHECK: .functype fptoui_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <8 x float> %x to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i16> @fptosi_v8f32_v8i16(<8 x float> %x) { +; CHECK-LABEL: fptosi_v8f32_v8i16: +; CHECK: .functype fptosi_v8f32_v8i16 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <8 x float> %x to <8 x i16> + ret <8 x i16> %conv +} + +define <8 x i16> @fptoui_v8f32_v8i16(<8 x float> %x) { +; CHECK-LABEL: fptoui_v8f32_v8i16: +; CHECK: .functype fptoui_v8f32_v8i16 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: 
local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <8 x float> %x to <8 x i16> + ret <8 x i16> %conv +} + +define <16 x i8> @fptosi_v16f32_v16i8(<16 x float> %x) { +; CHECK-LABEL: fptosi_v16f32_v16i8: +; CHECK: .functype fptosi_v16f32_v16i8 (v128, v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <16 x float> %x to <16 x i8> + ret <16 x i8> %conv +} + +define <16 x i8> @fptoui_v16f32_v16i8(<16 x float> %x) { +; CHECK-LABEL: fptoui_v16f32_v16i8: +; CHECK: .functype fptoui_v16f32_v16i8 (v128, v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <16 x float> %x to <16 x i8> + ret <16 x i8> %conv +} + +define <16 x i16> @fptosi_v16f32_v16i16(<16 x float> %x) { +; CHECK-LABEL: fptosi_v16f32_v16i16: +; CHECK: .functype fptosi_v16f32_v16i16 (i32, v128, v128, v128, v128) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: local.tee 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <16 x float> %x to <16 x i16> + ret <16 x i16> %conv +} + +define <16 x i16> @fptoui_v16f32_v16i16(<16 x float> %x) { +; CHECK-LABEL: fptoui_v16f32_v16i16: +; CHECK: .functype fptoui_v16f32_v16i16 (i32, v128, v128, v128, v128) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: local.tee 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: 
i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <16 x float> %x to <16 x i16> + ret <16 x i16> %conv +} + declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>) declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>) declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 5eb49fd..404db23 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s +; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" @@ -20,17 +20,17 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 ; CHECK: v128.store define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -64,17 +64,17 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: i32x4.add -; CHECK: i8x16.shuffle {{.*}} 4, 
5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ; CHECK: i32x4.sub -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 ; CHECK: v128.store define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -208,27 +208,27 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i16x8.sub -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i16x8.sub -; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i16x8.sub -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i16x8.sub -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr 
nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -276,27 +276,27 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.or -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.or -; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.xor -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.xor -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -343,27 +343,27 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly ; CHECK-LABEL: four_shorts_interleave_op: ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.or -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 
0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.xor -; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.or -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: v128.xor -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -483,19 +483,19 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 ; CHECK: i16x8.extmul_high_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 ; CHECK: i16x8.extmul_high_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 +; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 ; CHECK: v128.store ; CHECK: i16x8.extmul_low_i8x16_u ; CHECK: i16x8.extmul_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 +; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 ; CHECK: v128.store define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, 
ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -529,18 +529,18 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0 ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 ; CHECK: i16x8.extmul_high_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 ; CHECK: i8x16.sub -; CHECK: i8x16.shuffle {{.*}} 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31 +; CHECK: i8x16.shuffle 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31 ; CHECK: v128.store ; CHECK: i16x8.extmul_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23 +; CHECK: i8x16.shuffle 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23 ; CHECK: v128.store define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -672,27 +672,27 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.and -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.and -; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.and -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.and -; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 +; CHECK: i8x16.shuffle 0, 0, 
4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 +; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0 +; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: v128.store define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -740,25 +740,25 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly % ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extmul_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extmul_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}}, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 -; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.sub -; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.sub -; CHECK: i8x16.shuffle {{.*}}, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}}, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}}, 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 +; CHECK: i8x16.shuffle 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 ; CHECK: v128.store define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -806,27 +806,27 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 
24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.add -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.sub -; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.add -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.sub -; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 +; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 +; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0 +; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19 +; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 ; CHECK: v128.store define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -1272,45 +1272,45 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun ; CHECK-LABEL: four_bytes_into_four_ints_vary_op: ; CHECK: loop ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extend_low_i16x8_u ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extend_low_i16x8_u ; CHECK: i32x4.add -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extend_low_i16x8_u -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extend_low_i16x8_u ; CHECK: i32x4.sub -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extmul_low_i16x8_u ; CHECK: v128.and -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i32x4.extend_low_i16x8_u -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { %5 = icmp eq i32 %3, 0 @@ -1365,7 +1365,7 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 +; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 ; CHECK: v128.store define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { %5 = icmp sgt i32 %3, 0 @@ -1396,35 +1396,35 @@ define hidden void 
@scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add ; CHECK: i16x8.add ; CHECK: i16x8.shr_u -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_u ; CHECK: i16x8.add ; CHECK: i16x8.add ; CHECK: i16x8.shr_u -; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 +; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 ; CHECK: v128.store define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { %5 = icmp sgt i32 %3, 0 @@ -1492,13 +1492,13 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3 ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.avgr_u -; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i8x16.avgr_u -; CHECK: i8x16.shuffle {{.*}} 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 +; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ; CHECK: v128.store define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { %5 = icmp sgt i32 %3, 0 @@ -1605,28 +1605,28 @@ for.body: ; preds = %entry, %for.body ; CHECK-LABEL: 
two_bytes_two_floats_same_op: ; CHECK: loop ; CHECK: v128.load64_zero -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: v128.load64_zero -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.mul -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.mul -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 ; CHECK: v128.store define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1663,28 +1663,28 @@ for.body: ; preds = %entry, %for.body ; CHECK-LABEL: two_bytes_two_floats_vary_op: ; CHECK: v128.load64_zero -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: v128.load64_zero -; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.add -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: i16x8.extend_low_i8x16_s ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.sub -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 ; CHECK: v128.store define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1723,38 +1723,24 @@ for.body: ; preds = %entry, %for.body ; CHECK: loop ; 
CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i32x4.trunc_sat_f32x4_s +; CHECK: v128.const 255, 255, 255, 255 +; CHECK: v128.and +; CHECK: i16x8.narrow_i32x4_u +; CHECK: i8x16.narrow_i16x8_u +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane +; CHECK: i32x4.trunc_sat_f32x4_s +; CHECK: v128.and +; CHECK: i16x8.narrow_i32x4_u +; CHECK: i8x16.narrow_i16x8_u +; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK: v128.store64_lane define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1791,38 +1777,24 @@ for.body: ; preds = %entry, %for.body ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 ; CHECK: f32x4.add -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i32x4.trunc_sat_f32x4_s +; CHECK: v128.const 255, 255, 255, 255 +; CHECK: v128.and +; CHECK: i16x8.narrow_i32x4_u +; CHECK: i8x16.narrow_i16x8_u +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ; CHECK: f32x4.sub -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i8x16.replace_lane -; CHECK: f32x4.extract_lane 
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.store64_lane
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1858,24 +1830,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1913,24 +1885,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1969,38 +1941,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2037,38 +1993,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2195,58 +2135,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2302,58 +2242,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2410,88 +2350,60 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
 ; CHECK: v128.store
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2544,88 +2456,60 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
 ; CHECK: v128.store
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2678,51 +2562,51 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2779,47 +2663,47 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2876,89 +2760,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
 ; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
 ; CHECK: v128.store
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -3011,89 +2864,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
 ; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
 ; CHECK: v128.store
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..21b25d8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT: shlxl %esi, %edx, %edx
+; POSTRA-NEXT: bextrl %eax, %esi, %eax
+; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT: btrl %esi, %ecx
+; POSTRA-NEXT: orl %ecx, %edx
+; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT: movq 16(%rdi), %rax
+; POSTRA-NEXT: movq (%rdi), %rcx
+; POSTRA-NEXT: movq 24(%rdi), %rdx
+; POSTRA-NEXT: movq 8(%rdi), %rsi
+; POSTRA-NEXT: orq 56(%rdi), %rdx
+; POSTRA-NEXT: orq 40(%rdi), %rsi
+; POSTRA-NEXT: orq 48(%rdi), %rax
+; POSTRA-NEXT: orq 32(%rdi), %rcx
+; POSTRA-NEXT: orq %rdx, %rsi
+; POSTRA-NEXT: orq %rax, %rcx
+; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA: # %bb.0:
+; NOPOSTRA-NEXT: movl %esi, %eax
+; NOPOSTRA-NEXT: shrl $3, %eax
+; NOPOSTRA-NEXT: andl $60, %eax
+; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx
+; NOPOSTRA-NEXT: btrl %esi, %ecx
+; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx
+; NOPOSTRA-NEXT: orl %ecx, %edx
+; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax)
+; NOPOSTRA-NEXT: movq 16(%rdi), %rax
+; NOPOSTRA-NEXT: movq (%rdi), %rcx
+; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT: orq 48(%rdi), %rax
+; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT: orq %rsi, %rdx
+; NOPOSTRA-NEXT: orq %rax, %rcx
+; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: retq
+ %rem = and i64 %idx, 511
+ %sh_prom = zext nneg i64 %rem to i512
+ %shl = shl nuw i512 1, %sh_prom
+ %not = xor i512 %shl, -1
+ %load = load i512, ptr %v, align 8
+ %and = and i512 %load, %not
+ %conv2 = zext i1 %b to i512
+ %shl4 = shl nuw i512 %conv2, %sh_prom
+ %or = or i512 %and, %shl4
+ store i512 %or, ptr %v, align 8
+ %cmp = icmp ne i512 %or, 0
+ ret i1 %cmp
+}
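For context on the memory-interleave.ll changes above, here is a minimal, hypothetical reduction (not part of this commit; file and function names are invented) of the pattern whose lowering the updated checks describe. Assuming a wasm target with SIMD enabled (e.g. llc -mtriple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint), a whole-vector float-to-short conversion like this is now expected to select i32x4.trunc_sat_f32x4_s, a v128.and mask of 65535, and i16x8.narrow_i32x4_u, rather than a per-lane chain of f32x4.extract_lane / i32.trunc_sat_f32_s / i16x8.replace_lane:

; Hypothetical reduced case, not an actual test in this commit.
define void @trunc_floats_to_shorts(<4 x float> %a, <4 x float> %b, ptr %res) {
  %mul = fmul <4 x float> %a, %b                ; f32x4.mul
  %conv = fptosi <4 x float> %mul to <4 x i16>  ; vector trunc_sat + and + narrow
  store <4 x i16> %conv, ptr %res
  ret void
}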
