path: root/llvm/test/CodeGen
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- llvm/test/CodeGen/AArch64/sbc.ll                        | 392
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-extract-element.ll        | 132
-rw-r--r-- llvm/test/CodeGen/PowerPC/milicode32.ll                 | 56
-rw-r--r-- llvm/test/CodeGen/PowerPC/milicode64.ll                 | 79
-rw-r--r-- llvm/test/CodeGen/RISCV/attributes.ll                   | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/features-info.ll                | 1
-rw-r--r-- llvm/test/CodeGen/RISCV/overflow-intrinsics.ll          | 48
-rw-r--r-- llvm/test/CodeGen/SPIRV/non_int_constant_null.ll        | 25
-rw-r--r-- llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll  | 11
-rw-r--r-- llvm/test/CodeGen/SPIRV/zero-length-array.ll            | 13
-rw-r--r-- llvm/test/CodeGen/WebAssembly/memory-interleave.ll      | 278
-rw-r--r-- llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll  | 20
-rw-r--r-- llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir  | 2
-rw-r--r-- llvm/test/CodeGen/X86/ldexp-avx512.ll                   | 1288
-rw-r--r-- llvm/test/CodeGen/X86/pr166534.ll                       | 68
15 files changed, 2050 insertions, 367 deletions
diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll
new file mode 100644
index 0000000..fff63c1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sbc.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s
+; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s
+
+target triple = "aarch64-none-linux-gnu"
+
+define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_basic_i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sbc w0, w2, w3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_basic_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ ret i32 %res
+}
+
+define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) {
+; CHECK-SD-LABEL: test_basic_i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp x0, x1
+; CHECK-SD-NEXT: sbc x0, x2, x3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_basic_i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp x0, x1
+; CHECK-GI-NEXT: sub x9, x2, x3
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub x0, x9, x8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i64 %a, %b
+ %carry = zext i1 %cc to i64
+ %sub = sub i64 %x, %y
+ %res = sub i64 %sub, %carry
+ ret i64 %res
+}
+
+define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) {
+; CHECK-SD-LABEL: test_mixed_i32_i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sbc x0, x2, x3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_mixed_i32_i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub x9, x2, x3
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub x0, x9, x8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i64
+ %sub = sub i64 %x, %y
+ %res = sub i64 %sub, %carry
+ ret i64 %res
+}
+
+define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_mixed_i64_i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp x0, x1
+; CHECK-SD-NEXT: sbc w0, w2, w3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_mixed_i64_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp x0, x1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i64 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ ret i32 %res
+}
+
+define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) {
+; CHECK-SD-LABEL: test_only_borrow:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sbc w0, w2, wzr
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_only_borrow:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub w0, w2, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %res = sub i32 %x, %carry
+ ret i32 %res
+}
+
+define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_sext_add:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sbc w0, w2, w3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sext_add:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sbfx w8, w8, #0, #1
+; CHECK-GI-NEXT: add w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = sext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = add i32 %sub, %carry
+ ret i32 %res
+}
+
+; FIXME: This case could be supported with reversed operands to the CMP.
+define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_ugt:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sub w8, w2, w3
+; CHECK-SD-NEXT: cset w9, hi
+; CHECK-SD-NEXT: sub w0, w8, w9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_ugt:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ugt i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ ret i32 %res
+}
+
+define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_unsupported_cc_slt:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sub w8, w2, w3
+; CHECK-SD-NEXT: cset w9, lt
+; CHECK-SD-NEXT: sub w0, w8, w9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_unsupported_cc_slt:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp slt i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ ret i32 %res
+}
+
+define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_unsupported_cc_sgt:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: sub w8, w2, w3
+; CHECK-SD-NEXT: cset w9, gt
+; CHECK-SD-NEXT: sub w0, w8, w9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_unsupported_cc_sgt:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp sgt i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ ret i32 %res
+}
+
+define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) {
+; CHECK-SD-LABEL: test_multiple_setcc_uses:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: sub w19, w2, w0
+; CHECK-SD-NEXT: bl use
+; CHECK-SD-NEXT: mov w0, w19
+; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_multiple_setcc_uses:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w30, -32
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: mov w19, w2
+; CHECK-GI-NEXT: cset w20, lo
+; CHECK-GI-NEXT: mov w0, w20
+; CHECK-GI-NEXT: bl use
+; CHECK-GI-NEXT: sub w0, w19, w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %res = sub i32 %x, %carry
+ tail call void @use(i1 %cc)
+ ret i32 %res
+}
+
+define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) {
+; CHECK-SD-LABEL: test_multiple_carry_uses:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: sub w19, w2, w0
+; CHECK-SD-NEXT: bl use
+; CHECK-SD-NEXT: mov w0, w19
+; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_multiple_carry_uses:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w30, -32
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: mov w19, w2
+; CHECK-GI-NEXT: cset w20, lo
+; CHECK-GI-NEXT: mov w0, w20
+; CHECK-GI-NEXT: bl use
+; CHECK-GI-NEXT: sub w0, w19, w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %res = sub i32 %x, %carry
+ tail call void @use(i32 %carry)
+ ret i32 %res
+}
+
+define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) {
+; CHECK-SD-LABEL: test_multiple_sub_uses:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: sub w8, w2, w3
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: mov w0, w8
+; CHECK-SD-NEXT: sbc w19, w2, w3
+; CHECK-SD-NEXT: bl use
+; CHECK-SD-NEXT: mov w0, w19
+; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_multiple_sub_uses:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w30, -32
+; CHECK-GI-NEXT: sub w19, w2, w3
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: mov w0, w19
+; CHECK-GI-NEXT: cset w20, lo
+; CHECK-GI-NEXT: bl use
+; CHECK-GI-NEXT: sub w0, w19, w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i32 %a, %b
+ %carry = zext i1 %cc to i32
+ %sub = sub i32 %x, %y
+ %res = sub i32 %sub, %carry
+ tail call void @use(i32 %sub)
+ ret i32 %res
+}
+
+define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) {
+; CHECK-SD-LABEL: test_i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xff
+; CHECK-SD-NEXT: cmp w8, w1, uxtb
+; CHECK-SD-NEXT: sbc w0, w2, w3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cmp w8, w1, uxtb
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i8 %a, %b
+ %carry = zext i1 %cc to i8
+ %sub = sub i8 %x, %y
+ %res = sub i8 %sub, %carry
+ ret i8 %res
+}
+
+define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) {
+; CHECK-SD-LABEL: test_i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xffff
+; CHECK-SD-NEXT: cmp w8, w1, uxth
+; CHECK-SD-NEXT: sbc w0, w2, w3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xffff
+; CHECK-GI-NEXT: sub w9, w2, w3
+; CHECK-GI-NEXT: cmp w8, w1, uxth
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: sub w0, w9, w8
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult i16 %a, %b
+ %carry = zext i1 %cc to i16
+ %sub = sub i16 %x, %y
+ %res = sub i16 %sub, %carry
+ ret i16 %res
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-SD-LABEL: test_v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v4.4s, #1
+; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
+ %cc = icmp ult <4 x i32> %a, %b
+ %carry = zext <4 x i1> %cc to <4 x i32>
+ %sub = sub <4 x i32> %x, %y
+ %res = sub <4 x i32> %sub, %carry
+ ret <4 x i32> %res
+}
+
+declare void @use()
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index c340df1..0cc2e04 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -12,6 +12,26 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane0_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.b[0]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 0
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane0_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.b[0]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 0
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane15_16xi8:
; CHECK: // %bb.0:
@@ -21,6 +41,26 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.b[15]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 15
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.b[15]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 15
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane16_16xi8:
; CHECK: // %bb.0:
@@ -31,6 +71,32 @@ define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
ret i8 %b
}
+; FIXME: FMOV+AND -> UMOV.
+define i32 @test_lane16_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[16]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 16
+ %c = zext i8 %b to i32
+ ret i32 %c
+}
+
+; FIXME: FMOV+AND -> UMOV.
+define i64 @test_lane16_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[16]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 16
+ %c = zext i8 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane0_8xi16:
; CHECK: // %bb.0:
@@ -40,6 +106,26 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane0_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 0
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane0_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 0
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane7_8xi16:
; CHECK: // %bb.0:
@@ -49,6 +135,26 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.h[7]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 7
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.h[7]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 7
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane8_8xi16:
; CHECK: // %bb.0:
@@ -59,6 +165,32 @@ define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
ret i16 %b
}
+; FIXME: FMOV+AND -> UMOV.
+define i32 @test_lane8_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16_zext_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 8
+ %c = zext i16 %b to i32
+ ret i32 %c
+}
+
+; FIXME: FMOV+AND -> UMOV.
+define i64 @test_lane8_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16_zext_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and x0, x8, #0xffff
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 8
+ %c = zext i16 %b to i64
+ ret i64 %c
+}
+
define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: test_lane0_4xi32:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll
index 78d0362..ddadd01 100644
--- a/llvm/test/CodeGen/PowerPC/milicode32.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode32.ll
@@ -69,3 +69,59 @@ entry:
}
declare i32 @strlen(ptr noundef) nounwind
+
+define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 {
+; CHECK-AIX-32-P9-LABEL: test_memmove:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: mflr r0
+; CHECK-AIX-32-P9-NEXT: stwu r1, -80(r1)
+; CHECK-AIX-32-P9-NEXT: stw r0, 88(r1)
+; CHECK-AIX-32-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill
+; CHECK-AIX-32-P9-NEXT: mr r31, r3
+; CHECK-AIX-32-P9-NEXT: stw r3, 72(r1)
+; CHECK-AIX-32-P9-NEXT: stw r4, 68(r1)
+; CHECK-AIX-32-P9-NEXT: stw r5, 64(r1)
+; CHECK-AIX-32-P9-NEXT: bl .___memmove[PR]
+; CHECK-AIX-32-P9-NEXT: nop
+; CHECK-AIX-32-P9-NEXT: mr r3, r31
+; CHECK-AIX-32-P9-NEXT: lwz r31, 76(r1) # 4-byte Folded Reload
+; CHECK-AIX-32-P9-NEXT: addi r1, r1, 80
+; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1)
+; CHECK-AIX-32-P9-NEXT: mtlr r0
+; CHECK-AIX-32-P9-NEXT: blr
+;
+; CHECK-LINUX32-P9-LABEL: test_memmove:
+; CHECK-LINUX32-P9: # %bb.0: # %entry
+; CHECK-LINUX32-P9-NEXT: mflr r0
+; CHECK-LINUX32-P9-NEXT: stwu r1, -32(r1)
+; CHECK-LINUX32-P9-NEXT: stw r0, 36(r1)
+; CHECK-LINUX32-P9-NEXT: .cfi_def_cfa_offset 32
+; CHECK-LINUX32-P9-NEXT: .cfi_offset lr, 4
+; CHECK-LINUX32-P9-NEXT: .cfi_offset r30, -8
+; CHECK-LINUX32-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; CHECK-LINUX32-P9-NEXT: mr r30, r3
+; CHECK-LINUX32-P9-NEXT: stw r3, 20(r1)
+; CHECK-LINUX32-P9-NEXT: stw r4, 16(r1)
+; CHECK-LINUX32-P9-NEXT: stw r5, 12(r1)
+; CHECK-LINUX32-P9-NEXT: bl memmove
+; CHECK-LINUX32-P9-NEXT: mr r3, r30
+; CHECK-LINUX32-P9-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; CHECK-LINUX32-P9-NEXT: lwz r0, 36(r1)
+; CHECK-LINUX32-P9-NEXT: addi r1, r1, 32
+; CHECK-LINUX32-P9-NEXT: mtlr r0
+; CHECK-LINUX32-P9-NEXT: blr
+entry:
+ %destination.addr = alloca ptr, align 4
+ %source.addr = alloca ptr, align 4
+ %num.addr = alloca i32, align 4
+ store ptr %destination, ptr %destination.addr, align 4
+ store ptr %source, ptr %source.addr, align 4
+ store i32 %num, ptr %num.addr, align 4
+ %0 = load ptr, ptr %destination.addr, align 4
+ %1 = load ptr, ptr %source.addr, align 4
+ %2 = load i32, ptr %num.addr, align 4
+ call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false)
+ ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll
index 8b87529..f7814a4 100644
--- a/llvm/test/CodeGen/PowerPC/milicode64.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode64.ll
@@ -100,3 +100,82 @@ entry:
}
declare i64 @strlen(ptr noundef) nounwind
+
+define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 {
+; CHECK-LE-P9-LABEL: test_memmove:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: mflr r0
+; CHECK-LE-P9-NEXT: .cfi_def_cfa_offset 80
+; CHECK-LE-P9-NEXT: .cfi_offset lr, 16
+; CHECK-LE-P9-NEXT: .cfi_offset r30, -16
+; CHECK-LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-LE-P9-NEXT: stdu r1, -80(r1)
+; CHECK-LE-P9-NEXT: std r0, 96(r1)
+; CHECK-LE-P9-NEXT: mr r30, r3
+; CHECK-LE-P9-NEXT: std r3, 56(r1)
+; CHECK-LE-P9-NEXT: std r4, 48(r1)
+; CHECK-LE-P9-NEXT: std r5, 40(r1)
+; CHECK-LE-P9-NEXT: bl memmove
+; CHECK-LE-P9-NEXT: nop
+; CHECK-LE-P9-NEXT: mr r3, r30
+; CHECK-LE-P9-NEXT: addi r1, r1, 80
+; CHECK-LE-P9-NEXT: ld r0, 16(r1)
+; CHECK-LE-P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-LE-P9-NEXT: mtlr r0
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_memmove:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: mflr r0
+; CHECK-BE-P9-NEXT: stdu r1, -160(r1)
+; CHECK-BE-P9-NEXT: std r0, 176(r1)
+; CHECK-BE-P9-NEXT: .cfi_def_cfa_offset 160
+; CHECK-BE-P9-NEXT: .cfi_offset lr, 16
+; CHECK-BE-P9-NEXT: .cfi_offset r30, -16
+; CHECK-BE-P9-NEXT: std r30, 144(r1) # 8-byte Folded Spill
+; CHECK-BE-P9-NEXT: mr r30, r3
+; CHECK-BE-P9-NEXT: std r3, 136(r1)
+; CHECK-BE-P9-NEXT: std r4, 128(r1)
+; CHECK-BE-P9-NEXT: std r5, 120(r1)
+; CHECK-BE-P9-NEXT: bl memmove
+; CHECK-BE-P9-NEXT: nop
+; CHECK-BE-P9-NEXT: mr r3, r30
+; CHECK-BE-P9-NEXT: ld r30, 144(r1) # 8-byte Folded Reload
+; CHECK-BE-P9-NEXT: addi r1, r1, 160
+; CHECK-BE-P9-NEXT: ld r0, 16(r1)
+; CHECK-BE-P9-NEXT: mtlr r0
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_memmove:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: mflr r0
+; CHECK-AIX-64-P9-NEXT: stdu r1, -144(r1)
+; CHECK-AIX-64-P9-NEXT: std r0, 160(r1)
+; CHECK-AIX-64-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill
+; CHECK-AIX-64-P9-NEXT: mr r31, r3
+; CHECK-AIX-64-P9-NEXT: std r3, 128(r1)
+; CHECK-AIX-64-P9-NEXT: std r4, 120(r1)
+; CHECK-AIX-64-P9-NEXT: std r5, 112(r1)
+; CHECK-AIX-64-P9-NEXT: bl .memmove[PR]
+; CHECK-AIX-64-P9-NEXT: nop
+; CHECK-AIX-64-P9-NEXT: mr r3, r31
+; CHECK-AIX-64-P9-NEXT: ld r31, 136(r1) # 8-byte Folded Reload
+; CHECK-AIX-64-P9-NEXT: addi r1, r1, 144
+; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1)
+; CHECK-AIX-64-P9-NEXT: mtlr r0
+; CHECK-AIX-64-P9-NEXT: blr
+entry:
+ %destination.addr = alloca ptr, align 8
+ %source.addr = alloca ptr, align 8
+ %num.addr = alloca i64, align 8
+ store ptr %destination, ptr %destination.addr, align 8
+ store ptr %source, ptr %source.addr, align 8
+ store i64 %num, ptr %num.addr, align 8
+ %0 = load ptr, ptr %destination.addr, align 8
+ %1 = load ptr, ptr %source.addr, align 8
+ %2 = load i64, ptr %num.addr, align 8
+ call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false)
+ ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 22c2d81..f26d4f0 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -125,6 +125,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s
; RUN: llc -mtriple=riscv32 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCNTRPMF %s
; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV32SMPMPMT %s
; RUN: llc -mtriple=riscv32 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV32SMRNMI %s
; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s
; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFA %s
@@ -275,6 +276,7 @@
; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s
; RUN: llc -mtriple=riscv64 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCNTRPMF %s
; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV64SMPMPMT %s
; RUN: llc -mtriple=riscv64 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV64SMRNMI %s
; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s
; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFA %s
@@ -439,6 +441,7 @@
; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0"
; RV32SMCNTRPMF: .attribute 5, "rv32i2p1_smcntrpmf1p0"
; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0"
+; RV32SMPMPMT: .attribute 5, "rv32i2p1_smpmpmt0p6"
; RV32SMRNMI: .attribute 5, "rv32i2p1_smrnmi1p0"
; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0"
; RV32ZVFBFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0"
@@ -587,6 +590,7 @@
; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0"
; RV64SMCNTRPMF: .attribute 5, "rv64i2p1_smcntrpmf1p0"
; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0"
+; RV64SMPMPMT: .attribute 5, "rv64i2p1_smpmpmt0p6"
; RV64SMRNMI: .attribute 5, "rv64i2p1_smrnmi1p0"
; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0"
; RV64ZVFBFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0"
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index cf44af6..3d9906f 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -27,6 +27,7 @@
; CHECK-NEXT: experimental - Experimental intrinsics.
; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)).
; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile.
+; CHECK-NEXT: experimental-smpmpmt - 'Smpmpmt' (PMP-based Memory Types Extension).
; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses).
; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves).
; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension).
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index ba6769b..0306bb1 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
ret i64 %Q
}
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic.
define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
; RV32-LABEL: uaddo4:
@@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
; RV32-NEXT: .cfi_offset s4, -24
; RV32-NEXT: .cfi_offset s5, -28
; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: mv s5, a5
-; RV32-NEXT: mv s3, a1
+; RV32-NEXT: mv s1, a5
+; RV32-NEXT: mv s4, a1
; RV32-NEXT: andi a1, a5, 1
-; RV32-NEXT: beqz a1, .LBB32_8
+; RV32-NEXT: beqz a1, .LBB32_6
; RV32-NEXT: # %bb.1: # %t
; RV32-NEXT: mv s0, a4
-; RV32-NEXT: mv s2, a3
-; RV32-NEXT: mv s1, a2
-; RV32-NEXT: mv s4, a0
-; RV32-NEXT: beq s3, a3, .LBB32_3
+; RV32-NEXT: mv s3, a3
+; RV32-NEXT: mv s2, a2
+; RV32-NEXT: mv s5, a0
+; RV32-NEXT: beq s4, a3, .LBB32_3
; RV32-NEXT: # %bb.2: # %t
-; RV32-NEXT: sltu s6, s3, s2
+; RV32-NEXT: sltu s6, s4, s3
; RV32-NEXT: j .LBB32_4
; RV32-NEXT: .LBB32_3:
-; RV32-NEXT: sltu s6, s4, s1
+; RV32-NEXT: sltu s6, s5, s2
; RV32-NEXT: .LBB32_4: # %t
; RV32-NEXT: mv a0, s6
; RV32-NEXT: call call
-; RV32-NEXT: beqz s6, .LBB32_8
+; RV32-NEXT: beqz s6, .LBB32_6
; RV32-NEXT: # %bb.5: # %end
-; RV32-NEXT: sltu a1, s4, s1
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: beq s3, s2, .LBB32_7
-; RV32-NEXT: # %bb.6: # %end
-; RV32-NEXT: sltu a0, s3, s2
-; RV32-NEXT: .LBB32_7: # %end
-; RV32-NEXT: sub a2, s3, s2
-; RV32-NEXT: sub a3, s4, s1
-; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: sw a3, 0(s0)
-; RV32-NEXT: sw a2, 4(s0)
-; RV32-NEXT: j .LBB32_9
-; RV32-NEXT: .LBB32_8: # %f
-; RV32-NEXT: mv a0, s5
-; RV32-NEXT: .LBB32_9: # %f
+; RV32-NEXT: sltu a0, s5, s2
+; RV32-NEXT: sub a1, s4, s3
+; RV32-NEXT: sub a2, s5, s2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sw a2, 0(s0)
+; RV32-NEXT: sw a1, 4(s0)
+; RV32-NEXT: mv a0, s6
+; RV32-NEXT: j .LBB32_7
+; RV32-NEXT: .LBB32_6: # %f
+; RV32-NEXT: mv a0, s1
+; RV32-NEXT: .LBB32_7: # %f
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll
new file mode 100644
index 0000000..0ba016a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - -filetype=obj | spirv-val %}
+
+@A = addrspace(1) constant [1 x i8] zeroinitializer
+
+; CHECK: OpName %[[#FOO:]] "foo"
+; CHECK: OpName %[[#A:]] "A"
+; CHECK: OpDecorate %[[#A]] Constant
+; CHECK: OpDecorate %[[#A]] LinkageAttributes "A" Export
+; CHECK: %[[#INT8:]] = OpTypeInt 8 0
+; CHECK: %[[#INT32:]] = OpTypeInt 32 0
+; CHECK: %[[#ONE:]] = OpConstant %[[#INT32]] 1
+; CHECK: %[[#ARR_INT8:]] = OpTypeArray %[[#INT8]] %7
+; CHECK: %[[#ARR_INT8_PTR:]] = OpTypePointer CrossWorkgroup %[[#ARR_INT8]]
+; CHECK: %[[#ARR_INT8_ZERO:]] = OpConstantNull %[[#ARR_INT8]]
+; CHECK: %13 = OpVariable %[[#ARR_INT8_PTR]] CrossWorkgroup %[[#ARR_INT8_ZERO]]
+; CHECK: %[[#FOO]] = OpFunction
+; CHECK: = OpLabel
+; CHECK: OpReturn
+; CHECK: OpFunctionEnd
+
+define spir_kernel void @foo() {
+entry:
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll
new file mode 100644
index 0000000..1d3ba2a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set.
+
+define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 {
+ %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0)
+ %3 = extractelement <2 x float> %2, i64 0
+ ret float %3
+}
+
diff --git a/llvm/test/CodeGen/SPIRV/zero-length-array.ll b/llvm/test/CodeGen/SPIRV/zero-length-array.ll
index 666176c..5fd94d2 100644
--- a/llvm/test/CodeGen/SPIRV/zero-length-array.ll
+++ b/llvm/test/CodeGen/SPIRV/zero-length-array.ll
@@ -1,10 +1,17 @@
; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s
; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %}
-; CHECK: %[[#type:]] = OpTypeInt 32 0
-; CHECK: %[[#ext:]] = OpConstant %[[#type]] 0
+; Nothing is generated, but compilation doesn't crash.
+; CHECK: OpName %[[#FOO:]] "foo"
+; CHECK: OpName %[[#RTM:]] "reg2mem alloca point"
+; CHECK: %[[#INT:]] = OpTypeInt 32 0
+; CHECK: %[[#RTM]] = OpConstant %[[#INT]] 0
+; CHECK: %[[#FOO]] = OpFunction
+; CHECK-NEXT: = OpLabel
+; CHECK-NEXT: OpReturn
+; CHECK-NEXT: OpFunctionEnd
-define spir_func void @_Z3foov() {
+define spir_func void @foo() {
entry:
%i = alloca [0 x i32], align 4
ret void
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 404db23..5d58ae2 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1720,28 +1720,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp22.not = icmp eq i32 %N, 0
@@ -1774,28 +1753,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp21.not = icmp eq i32 %N, 0
@@ -2347,64 +2305,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -2453,64 +2354,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
@@ -2757,62 +2601,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_shorts_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 65535, 65535, 65535, 65535
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
-; CHECK: v128.store
-; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -2861,62 +2650,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_shorts_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const 65535, 65535, 65535, 65535
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
-; CHECK: v128.store
-; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962dde..f2b4c49 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: Ltmp0: ## EH_LABEL
; CHECK-NEXT: ## implicit-def: $ebx
; CHECK-NEXT: calll __Znam
-; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: Ltmp1: ## EH_LABEL
; CHECK-NEXT: ## %bb.1: ## %bb11
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movb $1, %al
@@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: jne LBB0_9
; CHECK-NEXT: ## %bb.10: ## %bb41
; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: Ltmp2: ## EH_LABEL
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _Pjii
-; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: Ltmp3: ## EH_LABEL
; CHECK-NEXT: ## %bb.11: ## %bb42
; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1
; CHECK-NEXT: xorl %eax, %eax
@@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: jmp LBB0_8
; CHECK-NEXT: LBB0_18: ## %bb43
-; CHECK-NEXT: Ltmp5:
+; CHECK-NEXT: Ltmp5: ## EH_LABEL
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: calll _OnOverFlow
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp6: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_3
; CHECK-NEXT: LBB0_2: ## %bb29
-; CHECK-NEXT: Ltmp7:
+; CHECK-NEXT: Ltmp7: ## EH_LABEL
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: calll _OnOverFlow
-; CHECK-NEXT: Ltmp8:
+; CHECK-NEXT: Ltmp8: ## EH_LABEL
; CHECK-NEXT: LBB0_3: ## %bb30
; CHECK-NEXT: ud2
; CHECK-NEXT: LBB0_4: ## %bb20.loopexit
-; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: Ltmp4: ## EH_LABEL
; CHECK-NEXT: LBB0_9:
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: LBB0_6: ## %bb23
@@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp
-; CHECK-NEXT: Ltmp9:
+; CHECK-NEXT: Ltmp9: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: Lfunc_end0:
bb:
diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
index 348a290..2445306 100644
--- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
@@ -55,7 +55,7 @@
!9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10)
!10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
!11 = !DILocation(line: 4, column: 1, scope: !5)
- !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+ !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7)
...
---
diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll
index ea93a91..21491bc 100644
--- a/llvm/test/CodeGen/X86/ldexp-avx512.ll
+++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll
@@ -47,6 +47,187 @@ entry:
}
declare fp128 @ldexpl(fp128, i32) memory(none)
+define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_8xhalf:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: addq $88, %rsp
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_8xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: subq $88, %rsp
+; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512VL-NEXT: addq $88, %rsp
+; AVX512VL-NEXT: retq
+ %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp)
+ ret <8 x half> %r
+}
+declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
+
define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_4xfloat:
; CHECK: # %bb.0:
@@ -109,6 +290,381 @@ define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi
}
declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
+define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_16xhalf:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $168, %rsp
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX512-NEXT: addq $168, %rsp
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_16xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: subq $168, %rsp
+; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX512VL-NEXT: addq $168, %rsp
+; AVX512VL-NEXT: retq
+ %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp)
+ ret <16 x half> %r
+}
+declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>)
+
define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_8xfloat:
; CHECK: # %bb.0:
@@ -230,6 +786,735 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi
}
declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
+define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_32xhalf:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $360, %rsp # imm = 0x168
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movswl %ax, %edi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq ldexpf@PLT
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; AVX512-NEXT: addq $360, %rsp # imm = 0x168
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_32xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168
+; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,0]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168
+; AVX512VL-NEXT: retq
+ %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp)
+ ret <32 x half> %r
+}
+declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>)
+
define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_16xfloat:
; CHECK: # %bb.0:
@@ -462,6 +1747,3 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi
}
declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512: {{.*}}
-; AVX512VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index aef44cc..162a0c9 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,100 +7,64 @@
define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
; SSE2-LABEL: pr166534:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %r8
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movq (%rsi), %r9
-; SSE2-NEXT: movq 8(%rsi), %rdi
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %esi
-; SSE2-NEXT: xorl %r10d, %r10d
+; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
-; SSE2-NEXT: sete %r10b
-; SSE2-NEXT: orq %r10, (%rdx)
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: orq %rax, (%rdx)
; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
; SSE2-NEXT: jne .LBB0_2
; SSE2-NEXT: # %bb.1: # %if.then
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %rdi, %r8
-; SSE2-NEXT: xorl %edx, %edx
-; SSE2-NEXT: orq %rax, %r8
-; SSE2-NEXT: sete %dl
-; SSE2-NEXT: orq %rdx, (%rcx)
+; SSE2-NEXT: orq %rax, (%rcx)
; SSE2-NEXT: .LBB0_2: # %if.end
; SSE2-NEXT: retq
;
; SSE4-LABEL: pr166534:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movq (%rdi), %rax
-; SSE4-NEXT: movq 8(%rdi), %r8
; SSE4-NEXT: movdqu (%rdi), %xmm0
-; SSE4-NEXT: movq (%rsi), %r9
-; SSE4-NEXT: movq 8(%rsi), %rdi
; SSE4-NEXT: movdqu (%rsi), %xmm1
; SSE4-NEXT: pxor %xmm0, %xmm1
-; SSE4-NEXT: xorl %esi, %esi
+; SSE4-NEXT: xorl %eax, %eax
; SSE4-NEXT: ptest %xmm1, %xmm1
-; SSE4-NEXT: sete %sil
-; SSE4-NEXT: orq %rsi, (%rdx)
+; SSE4-NEXT: sete %al
+; SSE4-NEXT: orq %rax, (%rdx)
; SSE4-NEXT: ptest %xmm1, %xmm1
; SSE4-NEXT: jne .LBB0_2
; SSE4-NEXT: # %bb.1: # %if.then
-; SSE4-NEXT: xorq %r9, %rax
-; SSE4-NEXT: xorq %rdi, %r8
-; SSE4-NEXT: xorl %edx, %edx
-; SSE4-NEXT: orq %rax, %r8
-; SSE4-NEXT: sete %dl
-; SSE4-NEXT: orq %rdx, (%rcx)
+; SSE4-NEXT: orq %rax, (%rcx)
; SSE4-NEXT: .LBB0_2: # %if.end
; SSE4-NEXT: retq
;
; AVX2-LABEL: pr166534:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %r8
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: movq (%rsi), %rdi
; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movq 8(%rsi), %rsi
-; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: vptest %xmm0, %xmm0
-; AVX2-NEXT: sete %r9b
-; AVX2-NEXT: orq %r9, (%rdx)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: orq %rax, (%rdx)
; AVX2-NEXT: vptest %xmm0, %xmm0
; AVX2-NEXT: jne .LBB0_2
; AVX2-NEXT: # %bb.1: # %if.then
-; AVX2-NEXT: xorq %rdi, %rax
-; AVX2-NEXT: xorq %rsi, %r8
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: orq %rax, %r8
-; AVX2-NEXT: sete %dl
-; AVX2-NEXT: orq %rdx, (%rcx)
+; AVX2-NEXT: orq %rax, (%rcx)
; AVX2-NEXT: .LBB0_2: # %if.end
; AVX2-NEXT: retq
;
; AVX512-LABEL: pr166534:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: movq 8(%rdi), %r8
; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: movq (%rsi), %r9
-; AVX512-NEXT: movq 8(%rsi), %rdi
; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: vptest %xmm0, %xmm0
-; AVX512-NEXT: sete %sil
-; AVX512-NEXT: orq %rsi, (%rdx)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: orq %rax, (%rdx)
; AVX512-NEXT: vptest %xmm0, %xmm0
; AVX512-NEXT: jne .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
-; AVX512-NEXT: xorq %r9, %rax
-; AVX512-NEXT: xorq %rdi, %r8
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: sete %dl
-; AVX512-NEXT: orq %rdx, (%rcx)
+; AVX512-NEXT: orq %rax, (%rcx)
; AVX512-NEXT: .LBB0_2: # %if.end
; AVX512-NEXT: retq
entry: